# BigMart Sales Data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading data

In [None]:
# Load the training and test CSV files into separate DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/bigmart-sales-data/Train.csv",
        "../input/bigmart-sales-data/Test.csv",
    )
)

In [None]:
# Preview the leading rows of the training set.
df_train.head()

In [None]:
# Preview the leading rows of the test set.
df_test.head()

### Getting data info

In [None]:
# Summarise the training columns (dtypes and non-null counts) to spot missing values.
df_train.info()

**Some columns have missing values, such as Item_Weight and Outlet_Size.**

In [None]:
# Summarise the test columns (dtypes and non-null counts) to spot missing values.
df_test.info()

In [None]:
# (rows, columns) of the training set.
df_train.shape

**Some columns have missing values, such as Item_Weight and Outlet_Size.**

In [None]:
# Descriptive statistics for the training set.
df_train.describe()

In [None]:
# (rows, columns) of the test set.
df_test.shape

In [None]:
# Count training rows that are exact duplicates of an earlier row.
df_train.duplicated().sum()

**There are no duplicate rows.**

In [None]:
# Pairwise correlation of the numeric columns only.
# Selecting numeric dtypes explicitly keeps this working on pandas >= 2.0,
# where DataFrame.corr() no longer silently skips string columns and raises instead.
df_train.select_dtypes(include='number').corr()

## data cleaning

In [None]:
# Dealing with null values in the training set.
# Missing item weights are imputed with the column mean. fillna() is used instead of
# replace(np.NaN, ...) because the np.NaN alias was removed in NumPy 2.0.
df_train['Item_Weight']=df_train['Item_Weight'].fillna(df_train['Item_Weight'].mean())
# Rows without an outlet size are discarded rather than imputed.
df_train=df_train.dropna(subset=['Outlet_Size'])

In [None]:
# Dealing with null values in the test set.
# Missing item weights are imputed with the test-set column mean. fillna() is used instead
# of replace(np.NaN, ...) because the np.NaN alias was removed in NumPy 2.0.
df_test['Item_Weight']=df_test['Item_Weight'].fillna(df_test['Item_Weight'].mean())
# NOTE(review): test rows without an outlet size are dropped here, so they receive no
# prediction downstream; consider imputing Outlet_Size if every test row needs a value.
df_test=df_test.dropna(subset=['Outlet_Size'])

In [None]:
# Map the alternate spellings of the fat-content labels onto 'Low Fat' / 'Regular'.
# Only df_train is normalised in this cell.
fat_content_aliases={'low fat':'Low Fat','reg':'Regular','LF':'Low Fat'}
df_train['Item_Fat_Content']=df_train['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# Re-check non-null counts of the training set after cleaning.
df_train.info()

In [None]:
# Re-check non-null counts of the test set after cleaning.
df_test.info()

**The data is now completely cleaned.**

# Getting insights

### categorical data

In [None]:
# Frequency of each item type in the training set.
df_train['Item_Type'].value_counts()

In [None]:
# Bar chart of how many rows fall into each item type.
plt.figure(figsize=(8,8))
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`,
# so a bare Series no longer produces a per-category count.
sns.countplot(x=df_train['Item_Type'])
# Rotate the category labels so the long item-type names do not overlap.
plt.xticks(rotation=90);
plt.title("count of each Item")

In [None]:
# Frequency of each fat-content label in the training set.
df_train['Item_Fat_Content'].value_counts()

In [None]:
# Bar chart of the fat-content label frequencies.
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`,
# so a bare Series no longer produces a per-category count.
sns.countplot(x=df_train['Item_Fat_Content'])
plt.xticks(rotation=90);
plt.title("Fat type")

In [None]:
# Frequency of each outlet location type in the training set.
df_train['Outlet_Location_Type'].value_counts()

In [None]:
# Bar chart of the outlet location type frequencies.
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`,
# so a bare Series no longer produces a per-category count.
sns.countplot(x=df_train['Outlet_Location_Type'])
plt.xticks(rotation=90);
# The title previously read "Fat type", copied from the fat-content plot.
plt.title("Outlet location type")

In [None]:
# Frequency of each outlet size in the training set.
df_train['Outlet_Size'].value_counts()

In [None]:
# Bar chart of the outlet size frequencies.
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`,
# so a bare Series no longer produces a per-category count.
sns.countplot(x=df_train['Outlet_Size'])
plt.xticks(rotation=90);
plt.title("size of outlet")

## Bivariate

In [None]:
# Scatter plot with a fitted regression line: item weight against item MRP.
plt.figure(figsize=(20,20))
sns.regplot(x='Item_Weight',y='Item_MRP',data=df_train);
plt.xlabel("Item_Weight")
plt.ylabel("Item_MRP")
plt.title("Item_Weight vs Item_MRP")

In [None]:
# Scatter plot with a fitted regression line: item visibility against item MRP.
plt.figure(figsize=(20,20))
sns.regplot(x='Item_Visibility',y='Item_MRP',data=df_train);
plt.xlabel("Item_Visibility")
plt.ylabel("Item_MRP")
plt.title("Item_Visibility vs Item_MRP")

# Making Model to predict

In [None]:
# Column names intended as model inputs.
predictors=[
    'Item_Weight',
    'Item_Fat_Content',
    'Item_Visibility',
    'Item_MRP',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]
# Column name of the value to predict.
target=['Item_Outlet_Sales']

In [None]:
# Work on copies so the cleaned df_train / df_test stay untouched by the encoding steps.
train,test=df_train.copy(),df_test.copy()

In [None]:
# Preview the working copy of the training set.
train.head()

## For training data

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Replace each categorical column of the training copy with integer codes.
# The single encoder is re-fitted per column, so afterwards it only remembers the last one.
lb=LabelEncoder()
categ=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type']
for column in categ:
    train[column]=lb.fit_transform(train[column])


In [None]:
# One-hot encode the (already integer-coded) categoricals plus the raw Item_Type column.
train_data=pd.get_dummies(
    data=train,
    columns=[
        'Item_Fat_Content',
        'Outlet_Location_Type',
        'Outlet_Size',
        'Outlet_Type',
        'Item_Type',
    ],
)

In [None]:
# Preview the one-hot encoded training frame.
train_data.head()

In [None]:
# Remove the identifier and establishment-year columns; they are not used as features.
train_data=train_data.drop(columns=['Item_Identifier','Outlet_Identifier','Outlet_Establishment_Year'])


In [None]:
# Preview the training features after dropping the identifier columns.
train_data.head()

In [None]:
# List the final training columns with their dtypes and non-null counts.
train_data.info()

## For test data

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Replace each categorical column of the test copy with integer codes.
# The encoder is fitted on the test values themselves, independently of the training set.
lb=LabelEncoder()
categ=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type']
for column in categ:
    test[column]=lb.fit_transform(test[column])


In [None]:
# Preview the label-encoded test copy.
test.head()

In [None]:
# One-hot encode the (already integer-coded) categoricals plus the raw Item_Type column.
test_data=pd.get_dummies(
    data=test,
    columns=[
        'Item_Fat_Content',
        'Outlet_Location_Type',
        'Outlet_Size',
        'Outlet_Type',
        'Item_Type',
    ],
)

In [None]:
# Preview the one-hot encoded test frame (mirrors the train_data.head() check on the
# training side; `test` itself was already previewed before the encoding step).
test_data.head()

In [None]:
# Remove the identifier and establishment-year columns; they are not used as features.
test_data=test_data.drop(columns=['Item_Identifier','Outlet_Identifier','Outlet_Establishment_Year'])


In [None]:
# Rebuild the fat-content indicator columns of the test features.
#
# The test labels were never normalised, so their spelling variants ('LF', 'low fat',
# 'reg') were label-encoded as separate classes. Simply dropping the surplus dummy
# columns leaves Item_Fat_Content_0/_1 with a different meaning than in training
# and silently discards the information for the other spellings.
# Instead, recompute both indicators from the raw labels held in df_test, which
# shares its row index with test_data.
fat_content=df_test['Item_Fat_Content'].replace({'low fat':'Low Fat','reg':'Regular','LF':'Low Fat'})
stale_cols=[col for col in test_data.columns if col.startswith('Item_Fat_Content_')]
dummy_dtype=test_data[stale_cols[0]].dtype
test_data=test_data.drop(columns=stale_cols)
# LabelEncoder assigns codes in sorted label order; assuming the normalised training
# labels are exactly 'Low Fat' and 'Regular', that is 0 -> 'Low Fat', 1 -> 'Regular'.
test_data['Item_Fat_Content_0']=(fat_content=='Low Fat').astype(dummy_dtype)
test_data['Item_Fat_Content_1']=(fat_content=='Regular').astype(dummy_dtype)
# Match the training feature set and column order; indicators absent from the test
# data are filled with 0.
test_data=test_data.reindex(columns=train_data.columns.drop('Item_Outlet_Sales'),fill_value=0)

In [None]:
# Preview the final test feature frame.
test_data.head()

In [None]:
# List the final test columns with their dtypes and non-null counts.
test_data.info()

# Fitting Linear Regression

In [None]:
# Feature matrix: every training column except the sales target.
X_train=train_data.drop(columns=['Item_Outlet_Sales'])


In [None]:
# Target vector: the sales column of the training frame.
y_train=train_data['Item_Outlet_Sales']

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of the sales target on the training features.
reg=LinearRegression()
reg.fit(X=X_train,y=y_train)

In [None]:
# Predict sales for the prepared test features with the fitted model.
prediction=reg.predict(X=test_data)

In [None]:
# Display the array of predicted sales values.
prediction

In [None]:
# Pair each test item identifier with its predicted sales value.
predicted_data=pd.DataFrame(
    data={
        'Item_Identifier':test['Item_Identifier'],
        'Item_Outlet_Sales':prediction,
    }
)

In [None]:
# Preview the leading rows of the prediction table.
predicted_data.head()

In [None]:
# Write the prediction table to CSV in the working directory, without the row index.
predicted_data.to_csv(path_or_buf="Predicted_outlet_sales.csv",index=False)