In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
train = pd.read_csv('Train_BigMart.csv')

In [None]:
train['Item_Fat_Content'].unique()

In [None]:
train['Outlet_Establishment_Year'].unique()

In [None]:
train['Outlet_Size'].unique()

In [None]:
train.describe()

In [None]:
train['Item_Visibility'].hist(bins=20)

In [None]:
train['Item_Fat_Content'].value_counts()

In [None]:
train['Outlet_Size'].value_counts()

In [None]:
train.boxplot(column='Item_MRP', by='Outlet_Size')

In [None]:
train.boxplot(column='Item_Visibility', by='Outlet_Type')

In [None]:

train['Outlet_Size'].mode()[0]

In [None]:

# fill the na for outlet size with medium
train['Outlet_Size'] = train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])

In [None]:

# fill the na for item weight with the mean of weights
train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Weight'].mean())

In [None]:
train.boxplot(column='Item_Visibility')

In [None]:
# delete the observations

Q1 = train['Item_Visibility'].quantile(0.25)
Q3 = train['Item_Visibility'].quantile(0.75)
IQR = Q3 - Q1
filt_train = train.query('(@Q1 - 1.5 * @IQR) <= Item_Visibility <= (@Q3 + 1.5 * @IQR)')

In [None]:
filt_train.shape, train.shape

In [None]:
# Continue with the outlier-filtered rows. Take an explicit copy: filt_train is
# a selection from the original frame, and the column assignments in the
# following cells would otherwise raise SettingWithCopyWarning.
train = filt_train.copy()
train.shape

In [None]:
train['Item_Visibility_bins'] = pd.cut(train['Item_Visibility'], [0.000, 0.065, 0.13, 0.2], labels=['Low Viz', 'Viz', 'High Viz'])

In [None]:
train['Item_Visibility_bins'] = train['Item_Visibility_bins'].replace(NaN, 'Low Viz')

In [None]:
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace(['low fat', 'LF'], 'Low Fat')

In [None]:
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace('reg', 'Regular')

In [None]:
#choosing the Fat content, item vizibility bins, outlet size, loc type and type for LABEL ENCODER
le = LabelEncoder()

In [None]:
train['Item_Fat_Content'].unique()

In [None]:
train['Item_Fat_Content'] = le.fit_transform(train['Item_Fat_Content'])

In [None]:
train['Item_Visibility_bins'] = le.fit_transform(train['Item_Visibility_bins'])

In [None]:
train['Outlet_Size'] = le.fit_transform(train['Outlet_Size'])

In [None]:
train['Outlet_Location_Type'] = le.fit_transform(train['Outlet_Location_Type'])

In [None]:
# create dummies for outlet type
dummy = pd.get_dummies(train['Outlet_Type'])
dummy.head()

In [None]:
train = pd.concat([train, dummy], axis=1)

In [None]:
# Linear regression should not be fed strongly correlated features, so show
# every feature pair whose absolute correlation exceeds 0.85.
#
# The matrix is computed once, over numeric/boolean columns only: the frame
# still holds string columns at this point, which DataFrame.corr() rejects in
# pandas >= 2.0. Self-correlations are removed by masking the diagonal rather
# than testing `!= 1`, so a perfectly correlated pair of distinct features is
# still reported.
corr_matrix = train.select_dtypes(include=['number', 'bool']).corr()
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)

corr_matrix[(corr_matrix.abs() > 0.85) & off_diagonal]

In [None]:
train.dtypes

In [None]:
# got to drop all the object types features
train = train.drop(['Item_Identifier', 'Item_Type', 'Outlet_Identifier', 'Outlet_Type'], axis=1)

In [None]:
train.columns

In [None]:
# build the linear regression model
X = train.drop('Item_Outlet_Sales', axis=1)
y = train.Item_Outlet_Sales

In [None]:
test = pd.read_csv('Test_BigMart.csv')
test['Outlet_Size'] = test['Outlet_Size'].fillna('Medium')

In [None]:
test['Item_Visibility_bins'] = pd.cut(test['Item_Visibility'], [0.000, 0.065, 0.13, 0.2], labels=['Low Viz', 'Viz', 'High Viz'])

In [None]:
test['Item_Weight'] = test['Item_Weight'].fillna(test['Item_Weight'].mean())

In [None]:
test['Item_Visibility_bins'] = test['Item_Visibility_bins'].replace(NaN, 'Low Viz')
test['Item_Visibility_bins'].head()

In [None]:
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace(['low fat', 'LF'], 'Low Fat')
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace('reg', 'Regular')

In [None]:
test['Item_Fat_Content'] = le.fit_transform(test['Item_Fat_Content'])

In [None]:
test['Item_Visibility_bins'] = le.fit_transform(test['Item_Visibility_bins'])

In [None]:
test['Outlet_Size'] = le.fit_transform(test['Outlet_Size'])

In [None]:
test['Outlet_Location_Type'] = le.fit_transform(test['Outlet_Location_Type'])

In [None]:
dummy = pd.get_dummies(test['Outlet_Type'])
test = pd.concat([test, dummy], axis=1)

In [None]:
X_test = test.drop(['Item_Identifier', 'Item_Type', 'Outlet_Identifier', 'Outlet_Type'], axis=1)

In [None]:
X.columns, X_test.columns

In [None]:
lin = LinearRegression()

In [None]:
lin.fit(X, y)
predictions = lin.predict(X_test)

In [None]:
# # create submission file
# submission = pd.DataFrame(data=[], columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'])
# submission['Item_Identifier'] = test['Item_Identifier']
# submission['Outlet_Identifier'] = test['Outlet_Identifier']
# submission['Item_Outlet_Sales'] = predictions
# submission.to_csv('submission.csv', index=False)
# submission.head()

In [None]:
# my first score was 1203 points - Linear regression only

In [None]:
# decision tree
dtree_class = DecisionTreeClassifier(criterion='gini', max_depth=25)
y = y.astype(int)

In [None]:
dtree_class.fit(X, y)

In [None]:
accuracy_score(y, dtree_class.predict(X))

In [None]:
r2_score(y, dtree_class.predict(X))

In [None]:
pred = dtree_class.predict(X_test)
pred

In [None]:
# # create submission file
# submission = pd.DataFrame(data=[], columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'])
# submission['Item_Identifier'] = test['Item_Identifier']
# submission['Outlet_Identifier'] = test['Outlet_Identifier']
# submission['Item_Outlet_Sales'] = pred
# submission.to_csv('submission.csv', index=False)
# submission.head()

In [None]:
# score was 1712 points - Decision Tree Classifier!!!

In [None]:
dtree_reg = DecisionTreeRegressor(criterion='mse', max_depth=10)

In [None]:
dtree_reg.fit(X, y)

In [None]:
pred = dtree_reg.predict(X_test)
pred

In [None]:
# # create submission file
# submission = pd.DataFrame(data=[], columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'])
# submission['Item_Identifier'] = test['Item_Identifier']
# submission['Outlet_Identifier'] = test['Outlet_Identifier']
# submission['Item_Outlet_Sales'] = pred
# submission.to_csv('submission2.csv', index=False)
# submission.head()

In [None]:
# score was 1289 points - Decision Tree Regression!!!

In [None]:
cross_val_score(lin, X, y, cv=5, scoring='r2')

In [None]:
cross_val_score(dtree_reg, X, y, cv=5, scoring='r2')

In [None]:
# Not run: cross_val_score(dtree_class, X, y, cv=5, scoring='roc_auc') raises an error.

In [None]:
r2_score(y, lin.predict(X))

In [None]:
r2_score(y, dtree_reg.predict(X))

In [None]:
avg_pred = (lin.predict(X) + dtree_reg.predict(X)) / 2

In [None]:
r2_score(y, avg_pred)

In [None]:
wavg_pred = lin.predict(X)*0.1 + dtree_reg.predict(X)*0.9

In [None]:
r2_score(y, wavg_pred)

In [None]:
rmf = RandomForestClassifier(n_estimators=100, max_depth=10)

In [None]:
rmf.fit(X, y)

In [None]:
r2_score(y, rmf.predict(X))

In [None]:
accuracy_score(y, rmf.predict(X))