In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline

In [None]:
# Relative path of the training CSV read into `train`.
TRAIN_CSV_PATH = '../input/big-mart-sales-prediction/Train.csv'
train = pd.read_csv(filepath_or_buffer=TRAIN_CSV_PATH)

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.nunique()

In [None]:
train.isnull().sum()

In [None]:
# Percentage of missing values per column.
# isnull().mean() is (missing rows) / (all rows); dividing by count() instead
# would use only the non-null rows as denominator and overstate the percentage.
train.isnull().mean() * 100

In [None]:
train.groupby(['Outlet_Identifier','Outlet_Size']).count()

In [None]:
train = train.drop('Outlet_Size', axis =1)

In [None]:
# Per-item averages of the numeric columns, ordered by average weight.
# numeric_only=True keeps the aggregation to numeric columns; without it,
# pandas >= 2.0 raises when it tries to average the text columns.
(
    train.groupby('Item_Identifier')
    .mean(numeric_only=True)
    .sort_values('Item_Weight')
)

In [None]:
train[train['Item_Identifier'].isin(['FDE52','FDK57','FDN52','FDQ60'])]

In [None]:
train = train.drop(train[train['Item_Identifier'].isin(['FDE52','FDK57','FDN52','FDQ60'])].index)

In [None]:
train[train['Item_Identifier']=='FDX49']

In [None]:
Item_Spec = train.groupby(['Item_Identifier', 'Item_Weight']).sum().reset_index()[['Item_Identifier','Item_Weight']]
Item_Spec = pd.Series(Item_Spec['Item_Weight'].values, index=Item_Spec['Item_Identifier']).to_dict()

In [None]:
Item_Spec

In [None]:
train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Identifier'].map(Item_Spec))

In [None]:
train.isnull().sum()

In [None]:
train['Item_Fat_Content'].unique()

In [None]:
Fat_Content = {'Low Fat':'Low Fat', 'low fat':'Low Fat', 'LF':'Low Fat', 'Regular':'Regular', 'reg':'Regular'}

In [None]:
Fat_Content

In [None]:
train['Item_Fat_Content'] = train['Item_Fat_Content'].map(Fat_Content)

In [None]:
train['Item_Fat_Content'].unique()

In [None]:
train

In [None]:
# Distribution of the target variable.
# sns.distplot is deprecated (seaborn >= 0.11); histplot with a KDE overlay and
# density normalisation is the documented replacement for the same picture.
fig, ax = plt.subplots(figsize=(14, 6))
sns.histplot(train['Item_Outlet_Sales'], bins=14, kde=True, stat='density', ax=ax)
ax.set_title('Distribution of Item_Outlet_Sales');

In [None]:
# Share of rows per item type.
# px.pie counts the rows for each distinct 'Item_Type' itself, so no
# pre-computed groupby index is needed.
fig = px.pie(train, names='Item_Type', title='Share of records by item type')
fig.show()

In [None]:
# Bar chart of the number of distinct item identifiers in each item type.
plt.figure(figsize=(14, 6))
items_per_type = train.groupby('Item_Type')['Item_Identifier'].nunique()
items_per_type.plot.bar()

In [None]:
# Histogram of item weight, coloured by fat content, with a marginal box plot.
weight_hist_options = {
    'x': "Item_Weight",
    'color': "Item_Fat_Content",
    'marginal': "box",
    'nbins': 20,
}
fig = px.histogram(train, **weight_hist_options)
# Blank out the y-axis title.
fig.update_layout(yaxis={'title': ''})
fig.show()

In [None]:
# Average outlet sales per item type, bars ordered from highest to lowest.
fig = px.histogram(
    data_frame=train,
    x='Item_Type',
    y='Item_Outlet_Sales',
    color="Item_Type",
    histfunc='avg',
    nbins=20,
)
fig = fig.update_xaxes(categoryorder='total descending')
fig.show()

In [None]:
# Scatter of outlet sales against the item's MRP.
fig = px.scatter(data_frame=train, x='Item_MRP', y='Item_Outlet_Sales')
fig.show()

In [None]:
# One labelled marker per outlet: establishment year against location tier,
# with colour and symbol both encoding the outlet type.
fig = px.scatter(
    data_frame=train,
    x='Outlet_Establishment_Year',
    y='Outlet_Location_Type',
    color='Outlet_Type',
    symbol='Outlet_Type',
    text='Outlet_Identifier',
)
fig.update_yaxes(categoryorder='total ascending')

# Marker size and placement/font of the outlet-identifier labels.
label_font = {'family': 'Arial', 'size': 12}
fig.update_traces(
    marker={'size': 12},
    textposition='top center',
    textfont=label_font,
)
fig.update_layout(height=600)

fig.show()

In [None]:
# 3x4 grid: two half-width panels on the first row, then two full-width rows.
fig3 = plt.figure(constrained_layout=True, figsize=(14, 12))
gs = fig3.add_gridspec(3, 4)
f3_ax1 = fig3.add_subplot(gs[0, 0:2])
f3_ax2 = fig3.add_subplot(gs[0, 2:4])
f3_ax3 = fig3.add_subplot(gs[1, :])
f3_ax4 = fig3.add_subplot(gs[2, :])


def _sales_boxplot(column, axis, category_order=None):
    """Draw a box plot of Item_Outlet_Sales grouped by `column` on `axis`."""
    return sns.boxplot(
        x=column,
        y='Item_Outlet_Sales',
        data=train,
        order=category_order,
        ax=axis,
    )


_sales_boxplot('Outlet_Location_Type', f3_ax1, ['Tier 1', 'Tier 2', 'Tier 3'])

_sales_boxplot(
    'Outlet_Type',
    f3_ax2,
    ['Grocery Store', 'Supermarket Type1', 'Supermarket Type2', 'Supermarket Type3'],
)

# No explicit category order for the establishment year.
_sales_boxplot('Outlet_Establishment_Year', f3_ax3)

_sales_boxplot(
    'Outlet_Identifier',
    f3_ax4,
    ['OUT019', 'OUT010', 'OUT018', 'OUT049', 'OUT035', 'OUT045', 'OUT017', 'OUT046', 'OUT013', 'OUT027'],
)



In [None]:
# Pairwise correlation of the numeric columns.
# The frame still holds text columns at this point; calling corr() on them
# raises on pandas >= 2.0, so the numeric columns are selected explicitly
# (select_dtypes works on old and new pandas versions alike).
train.select_dtypes(include='number').corr()

In [None]:
train.nunique()

In [None]:
data_train = train[['Item_Fat_Content', 'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Location_Type', 'Outlet_Type','Item_Outlet_Sales']]

In [None]:
data_train = pd.get_dummies(data_train)

In [None]:
data_train.corr()['Item_Outlet_Sales']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.pipeline import make_pipeline

In [None]:
# Split the encoded frame into a feature matrix and a target vector.
TARGET_COLUMN = 'Item_Outlet_Sales'
X = data_train.drop(columns=TARGET_COLUMN).to_numpy()
y = data_train[TARGET_COLUMN].to_numpy()

# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
lm = LinearRegression()
lm.fit(X_train, y_train)

In [None]:
predictions = lm.predict(X_test)
plt.scatter(y_test, predictions)

In [None]:
np.sqrt(metrics.mean_squared_error(y_test, predictions))

In [None]:
train.describe()

In [None]:
reg = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3))

In [None]:
reg.fit(X_train, y_train)

In [None]:
SGD_Predictions = reg.predict(X_test)

In [None]:
plt.scatter(y_test, SGD_Predictions)

In [None]:
np.sqrt(metrics.mean_squared_error(y_test, SGD_Predictions))

In [None]:
ls = Lasso(alpha=1)

In [None]:
ls.fit(X_train, y_train)

In [None]:
Lasso_Predictions = ls.predict(X_test)

In [None]:
plt.scatter(y_test, Lasso_Predictions)

In [None]:
np.sqrt(metrics.mean_squared_error(y_test, Lasso_Predictions))

In [None]:
result = [10000,0,0]
for sample in range(1,40):
    for leaf in range(1,sample):
        DCT = DecisionTreeRegressor(min_samples_split=sample, min_samples_leaf=leaf)
        DCT.fit(X_train, y_train)
        DecisionTree_Predictions = DCT.predict(X_test)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, DecisionTree_Predictions))
        if rmse < result[0]:
            result[0] = rmse
            result[1] = sample
            result[2] = leaf
    

In [None]:
result

In [None]:
DCT = DecisionTreeRegressor(min_samples_split=37, min_samples_leaf=36)
DCT.fit(X_train, y_train)
DecisionTree_Predictions = DCT.predict(X_test)
plt.scatter(y_test, DecisionTree_Predictions)

In [None]:
np.sqrt(metrics.mean_squared_error(y_test, DecisionTree_Predictions))