In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in file_names):
        print(file_path)

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_squared_error
%matplotlib inline

# **EDA**

In [None]:
df1 = pd.read_csv('../input/food-prices-in-turkey/train.csv')
df2 = pd.read_csv('../input/food-prices-in-turkey/test.csv')

In [None]:
df1.shape, df2.shape

In [None]:
df1.isna().sum()

In [None]:
df2.isna().sum()

In [None]:
df = pd.concat([df1, df2], axis=0)

In [None]:
df.shape

In [None]:
df

In [None]:
df.columns

In [None]:
df.drop(['ProductId', 'UmId'], axis=1, inplace=True)

In [None]:
df.head()

In [None]:
df['ProductName'].value_counts()

In [None]:
cols_to_replace = df['ProductName'].value_counts()[df['ProductName'].value_counts() < 50].keys()

In [None]:
df['ProductName'].replace(cols_to_replace,['Others' for i in range(6)],inplace=True)

In [None]:
df['ProductName'].value_counts()['Others']

In [None]:
# Horizontal bar chart of row counts per product name on a 15x15 inch figure.
# value_counts() sorts descending and the y-axis is inverted, so the most
# frequent product ends up at the top.
fig, ax = plt.subplots(figsize=(15, 15))
df['ProductName'].value_counts().plot(kind='barh', ax=ax)
ax.invert_yaxis()
plt.show()

In [None]:
df.dtypes

In [None]:
df.Place.value_counts().plot(kind='barh')
plt.gca().invert_yaxis()
plt.show()


In [None]:
df.UmName.value_counts()

In [None]:
df['Place'].value_counts().plot(kind='barh')
plt.gca().invert_yaxis()
plt.show()

In [None]:
df_dummies = pd.get_dummies(df[['Place', 'ProductName', 'UmName']],drop_first=True)

In [None]:
df_dummies.columns

In [None]:
df['Year'] = df['Year']/10000
df['Month'] = df['Month']/10000


In [None]:
df.head()

# **One Hot Encoding**

In [None]:
df_main = pd.concat([df_dummies, df[['Month', 'Year', 'Price']]], axis=1)

In [None]:
df_main.shape

In [None]:
df_main.head()

# **Splitting and Training**

In [None]:
x, y = df_main.drop(['Price'],axis=1), df_main['Price']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [None]:
x_train.shape, x_test.shape

In [None]:
y_train.shape, y_test.shape

# **Model Build and Predictions**

In [None]:
model = LinearRegression()
model.fit(x_train, y_train)

In [None]:
y_pred_test = model.predict(x_test)
y_pred_train = model.predict(x_train)

In [None]:
model.score(x_test, y_test)

In [None]:
model.score(x_train, y_train)

In [None]:
mean_squared_error(y_test, y_pred_test)

In [None]:
mean_squared_error(y_train, y_pred_train)

# **Applying KNeighborsRegressor**

In [None]:
model_kr = KNeighborsRegressor(n_neighbors=3,metric='euclidean',weights='distance')
model_kr.fit(x_train, y_train)

In [None]:
model_kr.score(x_test, y_test)

In [None]:
model_kr.score(x_train, y_train)

In [None]:
clf = GridSearchCV(KNeighborsRegressor(),
                   param_grid={
                       'metric':['euclidean','minkowski','manhattan'],
                       'weights':['uniform','distance'],
                       'n_neighbors':[3,5,7,11]
                   },
                   cv=7
                  )

In [None]:
clf.fit(x_train, y_train)

In [None]:
cv_result = pd.DataFrame(clf.cv_results_)
cv_result

In [None]:
cv_result[['param_weights','params','mean_test_score']]

# **Best Estimator and Model Prediction**

In [None]:
clf.best_estimator_

In [None]:
model_kr2 = KNeighborsRegressor(metric='manhattan', n_neighbors=3, weights='distance')
model_kr2.fit(x_train, y_train)

In [None]:
model_kr2.score(x_test, y_test)

In [None]:
model_kr2.score(x_train, y_train)

In [None]:
y_pred_test_kr = model_kr2.predict(x_test)
y_pred_train_kr = model_kr2.predict(x_train)

In [None]:
mean_squared_error(y_test, y_pred_test_kr)

In [None]:
mean_squared_error(y_train, y_pred_train_kr)

In [None]:
# Actual target values next to the KNN predictions, one frame per split.
# The actual values keep their row index; predictions are attached by position.
pred_kn_test = y_test.to_frame('Y test KN').assign(**{'Y predicted KN': y_pred_test_kr})
pred_kn_train = y_train.to_frame('Y Train KN').assign(**{'Y predicted KN': y_pred_train_kr})

# **Correlation Between Actual and Predicted Values**

In [None]:
pred_kn_test.corr()

In [None]:
pred_kn_train.corr()

In [None]:
pred_kn_test.sample(10)

In [None]:
x_train.to_csv('MyTrain.csv',index=False)
x_test.to_csv('MyTest.csv',index=False)

In [None]:
import os
os.listdir()