## IMPORTS

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Seed NumPy's global random number generator so random operations repeat.
np.random.seed(0)
# Silence every warning (note: this also hides library deprecation notices).
warnings.filterwarnings('ignore')


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
df = pd.read_csv('../input/bigmart-sales-data/Train.csv')
df.head()

In [None]:
# Summarise the columns: dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Show the rows whose Outlet_Size is missing.
df[df['Outlet_Size'].isnull()]

In [None]:
# Show the rows whose Item_Weight is missing.
df[df['Item_Weight'].isnull()]

In [None]:
# Mean Item_Weight within each Item_Type.
df.groupby("Item_Type")["Item_Weight"].mean()


## Preprocessing

In [None]:
# Impute missing Item_Weight values with the mean weight of the same Item_Type.
# Mapping each row's Item_Type onto the per-type means gives a label-aligned
# Series of fill values, so one `fillna` replaces the per-category loop and
# avoids positional `mean[i]` lookups on a Series indexed by category name.
mean = df.groupby('Item_Type')['Item_Weight'].mean()
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Type'].map(mean))

In [None]:
from statistics import mode
# Fill missing Outlet_Size with the most frequent observed size.
# Series.mode() skips NaN by default, so missing values can never be chosen as
# the fill value. Assigning the result back (rather than an in-place fillna on
# a column selection) guarantees that `df` itself is updated.
df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0])

In [None]:
# Preview the first rows after the imputation steps.
df.head()

In [None]:
# List the distinct labels present in Item_Fat_Content.
df["Item_Fat_Content"].unique()

In [None]:
# Map the alternative spellings of the fat-content labels onto the two
# canonical values; labels not listed in the mapping are left unchanged.
fat_content = {"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"}
df["Item_Fat_Content"] = df["Item_Fat_Content"].replace(to_replace=fat_content)

In [None]:
# Print the distinct values of each item-type / outlet column.
cols = [
    "Item_Type",
    "Outlet_Identifier",
    "Outlet_Establishment_Year",
    "Outlet_Size",
    "Outlet_Location_Type",
    "Outlet_Type",
]
for column_name in cols:
    distinct_values = df[column_name].unique()
    print(distinct_values)

## Exploratory Data Analysis

In [None]:
# Pairwise relationship plots of the DataFrame, coloured by Item_Fat_Content.
sns.pairplot(df, hue = "Item_Fat_Content");

In [None]:
# Number of rows per Item_Fat_Content category.
# The column is named via `x=` with `data=df`: the first positional argument
# of countplot is `data` in current seaborn, so passing the Series positionally
# is not interpreted as the x variable.
sns.countplot(x='Item_Fat_Content', data=df);

In [None]:
# Distribution of the target, Item_Outlet_Sales.
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay is its
# replacement, and stat="density" keeps the density-scaled y-axis.
sns.histplot(data=df, x='Item_Outlet_Sales', kde=True, stat="density");

In [None]:
# Item_Outlet_Sales against Item_Visibility.
sns.scatterplot(x='Item_Visibility',y='Item_Outlet_Sales',data=df);

In [None]:
# Number of rows per outlet.
# The column is named via `x=` with `data=df`: the first positional argument
# of countplot is `data` in current seaborn, so passing the Series positionally
# is not interpreted as the x variable.
sns.countplot(x='Outlet_Identifier', data=df);
plt.xticks(rotation = 45)  # tilt the outlet labels so they do not overlap
plt.show()

In [None]:
# Item_Outlet_Sales against Item_MRP.
sns.scatterplot(x='Item_MRP',y='Item_Outlet_Sales',data=df);

In [None]:
# Box plot of Item_Outlet_Sales for each outlet.
sns.boxplot(x='Outlet_Identifier',y='Item_Outlet_Sales',data =df);
# Rotate the x tick labels to vertical.
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Number of rows per Item_Type.
sns.countplot(x='Item_Type',data=df);
# Rotate the x tick labels to vertical.
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Bar plot of Item_Outlet_Sales per Outlet_Type, aggregated with seaborn's
# default estimator.
sns.barplot(y='Item_Outlet_Sales',x='Outlet_Type',data=df);
# Rotate the x tick labels to vertical.
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Annotated correlation heatmap of the numeric columns.
# Non-numeric columns are dropped explicitly: recent pandas versions raise in
# corr() instead of silently skipping string columns.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

In [None]:
# Number of rows per Outlet_Establishment_Year.
sns.countplot(x="Outlet_Establishment_Year", data = df);

In [None]:
# Number of rows per Outlet_Location_Type.
sns.countplot(x="Outlet_Location_Type", data = df);

In [None]:
# Number of rows per Outlet_Size.
sns.countplot(x="Outlet_Size", data = df);

In [None]:
# Preview the first rows before building model features.
df.head()

## Model Building

In [None]:
# Integer-encode the outlet identifier into a new 'Outlet' column, then
# integer-encode each remaining categorical column in place. The single
# encoder instance is re-fitted for every column.
le = LabelEncoder()
df['Outlet'] = le.fit_transform(df['Outlet_Identifier'])

cat_col = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]
for col in cat_col:
    encoded_values = le.fit_transform(df[col])
    df[col] = encoded_values

In [None]:
# One-hot encode the listed categorical columns; each is replaced by one
# indicator column per distinct value.
df = pd.get_dummies(
    df,
    columns=[
        'Item_Fat_Content',
        'Outlet_Size',
        'Outlet_Location_Type',
        'Outlet_Type',
    ],
)

In [None]:
# The target is the sales figure; the features are every remaining column
# except the raw identifiers and the establishment year.
y = df['Item_Outlet_Sales']
X = df.drop(
    columns=[
        'Outlet_Establishment_Year',
        'Item_Identifier',
        'Outlet_Identifier',
        'Item_Outlet_Sales',
    ]
)

In [None]:
# Hold out 20% of the rows for evaluation. No random_state is passed here.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.2)

In [None]:
# Fit an Extra-Trees regressor on the training split only, then score it on
# the held-out split. Fitting on all of X would leak the test rows into
# training and make the reported MSE misleadingly optimistic.
rf = ExtraTreesRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE : ", mse)

# Rank the features by the fitted model's importances and plot them.
coef = pd.Series(rf.feature_importances_, X.columns).sort_values(ascending=False)
coef.plot(kind='bar', title="Feature Importance", color='red')
plt.show()