# Importing libraries

In [None]:
import os
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)
    

In [None]:
# Locations of the two CSV files read by this notebook.
train_csv = '/kaggle/input/black-friday/train.csv'
test_csv = '/kaggle/input/black-friday/test.csv'

# Load each file into its own DataFrame.
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

**Having a look at train data**

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Per-column dtype and non-null count summary for the training frame.
train.info()

In [None]:
# Number of null entries in each training column.
train.isnull().sum()

In [None]:
# dtype of every training column.
train.dtypes

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the training frame.
train.describe()

**Having a look at test data**

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Per-column dtype and non-null count summary for the test frame.
test.info()

**Missing values**

In [None]:
def missing(df):
    """Return the percentage of null entries in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Any set of columns is accepted; the row count is
        taken from the frame itself rather than from a specific column.

    Returns
    -------
    pandas.Series
        Indexed by column name, holding ``100 * nulls / rows`` for each
        column, sorted from the highest percentage to the lowest.
    """
    null_counts = df.isnull().sum()
    # len(df) is the row count of the frame, so no particular column
    # (previously 'User_ID') has to exist for the helper to work.
    null_percentages = null_counts * 100 / len(df)
    return null_percentages.sort_values(ascending=False)

In [None]:
# Percentage of null entries per training column, highest first.
missing(train)

In [None]:
# Percentage of null entries per test column, highest first.
missing(test)

# EDA

In [None]:
# Number of training rows for each distinct Age value.
train['Age'].value_counts()

In [None]:
# Bar chart of the number of training rows per Age value.
plt.figure(figsize=(10,5))
# The column is passed by keyword: recent seaborn releases read the first
# positional argument as `data`, so a positional Series is no longer taken
# as the x variable.
sns.countplot(x='Age', data=train)

In [None]:
# Bar chart of the number of training rows per City_Category value.
plt.figure(figsize=(10,5))
# The column is passed by keyword: recent seaborn releases read the first
# positional argument as `data`, so a positional Series is no longer taken
# as the x variable.
sns.countplot(x='City_Category', data=train)

In [None]:
# Bar chart of the number of training rows per Stay_In_Current_City_Years value.
plt.figure(figsize=(10,5))
# The column is passed by keyword: recent seaborn releases read the first
# positional argument as `data`, so a positional Series is no longer taken
# as the x variable.
sns.countplot(x='Stay_In_Current_City_Years', data=train)

In [None]:
# Distinct Occupation values present in the training data.
train['Occupation'].unique()

In [None]:
# Distinct City_Category values present in the training data.
train['City_Category'].unique()

In [None]:
# Distinct Stay_In_Current_City_Years values present in the training data.
train['Stay_In_Current_City_Years'].unique()

In [None]:
# Print the distinct values of each of the three product-category columns, one array per line.
for category_col in ('Product_Category_1', 'Product_Category_2', 'Product_Category_3'):
    print(train[category_col].unique())

# Pre-processing

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Constant subtracted from every raw User_ID before label encoding.
USER_ID_OFFSET = 1000000

# Fit the encoder on the training IDs and reuse the same mapping for the test IDs.
# NOTE(review): LabelEncoder.transform raises ValueError for labels it was not
# fitted on, so this assumes every test User_ID also occurs in train - confirm.
enc = LabelEncoder()
train['User_ID'] = enc.fit_transform(train['User_ID'] - USER_ID_OFFSET)
test['User_ID'] = enc.transform(test['User_ID'] - USER_ID_OFFSET)

In [None]:
# Remove every occurrence of the substring 'P00' from the product codes in both frames.
train['Product_ID'], test['Product_ID'] = (
    frame['Product_ID'].str.replace('P00', '') for frame in (train, test)
)

# Standardise the remaining codes; the scaler statistics are learned from the
# training column only and then applied unchanged to the test column.
scaler = StandardScaler()
train_product_codes = train['Product_ID'].values.reshape(-1, 1)
test_product_codes = test['Product_ID'].values.reshape(-1, 1)
train['Product_ID'] = scaler.fit_transform(train_product_codes)
test['Product_ID'] = scaler.transform(test_product_codes)

In [None]:
# Columns that are label-encoded later in the notebook.
categorical_col = [
    'Gender',
    'City_Category',
]

# Columns that are standardised with StandardScaler later in the notebook.
numerical_col = [
    'Age',
    'Occupation',
    'Stay_In_Current_City_Years',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
]

In [None]:
# Replace each age-bracket label in the training frame with a single number:
# the bracket's upper bound, and 60 for the open-ended '55+' bracket.
train['Age'] = train['Age'].replace({
    '0-17': 17,
    '18-25': 25,
    '26-35': 35,
    '36-45': 45,
    '46-50': 50,
    '51-55': 55,
    '55+': 60,
})

In [None]:
# Replace each age-bracket label in the test frame with a single number:
# the bracket's upper bound, and 60 for the open-ended '55+' bracket.
test['Age'] = test['Age'].replace({
    '0-17': 17,
    '18-25': 25,
    '26-35': 35,
    '36-45': 45,
    '46-50': 50,
    '51-55': 55,
    '55+': 60,
})

In [None]:
# Map the '4+' label to 4, then convert the whole column to a numeric dtype.
# Replacing only '4+' would leave a column that mixes the integer 4 with
# whatever the other entries were read as (strings, if the CSV column was
# parsed as text), so the conversion is made explicit here instead of being
# left to a later step.
# NOTE(review): pd.to_numeric raises if any other non-numeric label is present.
train['Stay_In_Current_City_Years'] = pd.to_numeric(
    train['Stay_In_Current_City_Years'].replace('4+', 4)
)
test['Stay_In_Current_City_Years'] = pd.to_numeric(
    test['Stay_In_Current_City_Years'].replace('4+', 4)
)

**Filling missing values with zero**

In [None]:
# Replace every null entry in both frames with 0 (fillna returns new frames).
train, test = (frame.fillna(0) for frame in (train, test))

# Encoding

In [None]:
# Label-encode the categorical columns.
# One encoder object is reused; it is refitted on the training values of each
# column, and that column's mapping is then applied to the test values.

encoder = LabelEncoder()

for cat_col in categorical_col:
    encoder.fit(train[cat_col])
    train[cat_col] = encoder.transform(train[cat_col])
    test[cat_col] = encoder.transform(test[cat_col])

In [None]:
# Standardise the numerical columns.
# The scaler is refitted per column on the training values only; the test
# values are transformed with those same training statistics.

scaler = StandardScaler()

for num_col in numerical_col:
    train_values = train[num_col].values.reshape(-1, 1)
    test_values = test[num_col].values.reshape(-1, 1)
    scaler.fit(train_values)
    train[num_col] = scaler.transform(train_values)
    test[num_col] = scaler.transform(test_values)

In [None]:
# Preview the training frame after the encoding and scaling cells above.
train.head()

# Training the model

In [None]:
# Target: the Purchase column, kept as a one-column DataFrame.
y = train[['Purchase']]
# Features: every remaining training column.
X = train.drop(columns=['Purchase'])
# The processed test frame is used as-is under the name X_test.
X_test = test

# Hold out 20% of the training rows for validation (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=42,
    test_size=0.20,
)

# 1. Linear Regression

In [None]:
# Ordinary least-squares baseline with default settings.
reg=linear_model.LinearRegression()
# Fit on the training split; the result of fit() is kept as lm_model.
lm_model=reg.fit(X_train,y_train)
# Predictions for the held-out validation rows.
pred=lm_model.predict(X_val)

In [None]:
# Validation RMSE of the linear model: square root of the mean squared error
# between the true validation targets and the predictions.
np.sqrt(mean_squared_error(y_val,pred))

# 2. XGBoost

In [None]:
# Gradient-boosted tree regressor.
# Requires `from xgboost import XGBRegressor` in the notebook's import cell;
# without it this cell raises NameError on a fresh kernel.
xgb_reg = XGBRegressor(learning_rate=1.0, max_depth=6, min_child_weight=40, seed=0)

# Fit on the training split and predict the held-out validation rows.
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

# Validation RMSE. Arguments follow scikit-learn's (y_true, y_pred) order,
# matching the call used for the linear model.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

# Show the fitted estimator and its parameters.
print(xgb_reg)

In [None]:
# Display the validation RMSE computed for the XGBoost model.
rmse

**XGBoost is chosen over linear regression because it gives a lower RMSE on the validation set.**

# If you find the kernel useful, upvote it :)