In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
sns.set(color_codes=True)

In [None]:
# Read the training and test CSVs; both paths are relative to the notebook's working directory.
df = pd.read_csv('../input/amazon-ml-engineer-hiring/train.csv')
test = pd.read_csv("../input/amazon-ml-engineer-hiring/test.csv")

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
df.info()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Remove the customer_id column from both frames before analysis and modelling.
# Rebinding (rather than inplace=True) keeps the cell a plain "input -> output" step.
df = df.drop(columns="customer_id")
test = test.drop(columns="customer_id")

# Unique values

In [None]:
# Number of distinct values per column (NaN is not counted by default).
df.nunique()

`customer_active_segment`, `X1`, and `customer_category` are categorical columns.

# Removing Duplicate Rows

In [None]:
# Drop exact duplicate rows from the training data only.
df = df.drop_duplicates()

# The test frame is deliberately NOT de-duplicated: customer_id has already been
# removed, so distinct customers with identical features would be collapsed, and
# the submission cell pairs one prediction with every row of the re-read test.csv.
df.shape,test.shape

## Correlation with Target Variable 

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only.
# pandas >= 2.0 no longer drops non-numeric columns silently in DataFrame.corr(),
# so they are filtered out explicitly here; this also works on older pandas.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Correlation of every numeric feature with the target, strongest positive first.
corr_matrix['customer_category'].sort_values(ascending = False)

In [None]:
# Annotated heatmap of the correlation matrix computed in the previous cell.
plt.figure(figsize=(12,7))
sns.heatmap(corr_matrix,annot=True)

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    Only numeric and boolean columns are considered; other dtypes are ignored
    (pandas >= 2.0 would otherwise raise inside ``DataFrame.corr``).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        A column is reported when the absolute correlation with at least one
        column positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Names of the later column of every pair exceeding the threshold.
        The earlier column of a pair is never reported on account of that pair.
    """
    corr_matrix = dataset.select_dtypes(include=["number", "bool"]).corr()
    # Absolute values: strong negative correlation counts as well.
    strengths = corr_matrix.abs().to_numpy()
    # strengths[i, :i] is row i of the strict lower triangle, i.e. column i
    # against every column before it. NaN entries compare False and are skipped.
    return {
        name
        for i, name in enumerate(corr_matrix.columns)
        if (strengths[i, :i] > threshold).any()
    }
# Columns of df whose absolute correlation with an earlier column exceeds 0.8.
correlation(df,0.8)

We drop `customer_stay_score` because it is highly correlated with `customer_ctr_score`.

In [None]:
# Remove the redundant customer_stay_score feature from both frames.
df = df.drop(columns='customer_stay_score')
test = test.drop(columns='customer_stay_score')

## Data Visualization

In [None]:
# Share of rows per target class (customer_category).
px.pie(df,names=df.customer_category)

The data is biased towards class 0 in `customer_category`; therefore it is an `imbalanced dataset`.

In [None]:
# Share of rows per customer_active_segment value.
px.pie(df,names=df.customer_active_segment)

In [None]:
# Share of rows per X1 value.
px.pie(df,names = df.X1)

In [None]:
# Row count per target class.
sns.countplot(data=df,x = df.customer_category)

In [None]:
# Row count per X1 value.
sns.countplot(data=df,x = df.X1)

In [None]:
# Row count per customer_active_segment value.
sns.countplot(data=df,x = df.customer_active_segment)

## Distribution

In [None]:
# Overlaid density histograms (with KDE curves) of the visit and affinity scores.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
plt.figure(figsize=(10,8))
for score_col in ['customer_visit_score','customer_affinity_score']:
    # label= ties each legend entry to its own artist instead of relying on draw order.
    sns.histplot(df[score_col], kde=True, stat='density', label=score_col)
plt.legend()

In [None]:
# Overlaid density histograms (with KDE curves) of the CTR and order scores.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
plt.figure(figsize=(10,8))
for score_col in ['customer_ctr_score','customer_order_score']:
    # label= ties each legend entry to its own artist instead of relying on draw order.
    sns.histplot(df[score_col], kde=True, stat='density', label=score_col)
plt.legend()

In [None]:
# Density histogram with KDE curve of customer_product_variation_score.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(df.customer_product_variation_score, kde=True, stat='density')

In [None]:
# Side-by-side box plots of five score columns to compare spread and outliers.
px.box(df,y = ['customer_ctr_score','customer_order_score','customer_visit_score','customer_affinity_score','customer_product_variation_score'])

In [None]:
# One boxen plot per score column, each drawn on its own 10x8 figure.
for score_col in (
    'customer_ctr_score',
    'customer_order_score',
    'customer_visit_score',
    'customer_affinity_score',
):
    fig, ax = plt.subplots(figsize=(10,8))
    # boxenplot draws on the current axes, i.e. the one just created above.
    ax = sns.boxenplot(x=df[score_col])

plt.show()

In [None]:
# customer_affinity_score (y) against customer_order_score (x), coloured by target class.
px.scatter(df,y = df.customer_affinity_score,x = df.customer_order_score,color=df.customer_category,opacity=1)

There is a positive correlation between `customer_affinity_score` and `customer_order_score`.

In [None]:
# customer_product_variation_score (y) against customer_product_search_score (x), coloured by target class.
px.scatter(df,y = df.customer_product_variation_score,x = df.customer_product_search_score,color=df.customer_category)

In [None]:
# customer_order_score (y) against customer_product_variation_score (x), coloured by target class.
px.scatter(df,x = df.customer_product_variation_score,y = df.customer_order_score,color=df.customer_category)

There is a negative correlation between `customer_order_score` and `customer_product_variation_score`.

In [None]:
# customer_visit_score (y) against customer_ctr_score (x), coloured by target class.
px.scatter(df,y = df.customer_visit_score,x = df.customer_ctr_score,color=df.customer_category)

There is a negative correlation between `customer_visit_score` and `customer_ctr_score`.

In [None]:
# customer_product_search_score (y) against customer_order_score (x), coloured by target class.
px.scatter(df,x = df.customer_order_score,y = df.customer_product_search_score,color=df.customer_category)

In [None]:
# customer_ctr_score (y) against customer_product_search_score (x), coloured by target class.
px.scatter(df,x = df.customer_product_search_score,y = df.customer_ctr_score,color=df.customer_category)

In [None]:
# customer_product_variation_score (y) against customer_affinity_score (x), coloured by target class.
px.scatter(df,x = df.customer_affinity_score,y = df.customer_product_variation_score,color=df.customer_category)

`customer_affinity_score` and `customer_product_variation_score` are negatively correlated, and there are more outliers in `customer_affinity_score`.

In [None]:
# Grid of pairwise relationships between the columns of df.
# NOTE(review): likely slow on a large frame — consider sampling rows; confirm dataset size.
sns.pairplot(df)

# One-Hot Encoding of Categorical Columns

In [None]:
# One-hot encode the two categorical features; drop_first removes one level per
# feature to avoid perfectly collinear dummy columns.
categorical_cols = ['X1','customer_active_segment']
df = pd.get_dummies(df,columns=categorical_cols,drop_first=True)
test = pd.get_dummies(test,columns=categorical_cols,drop_first=True)

# The two frames are encoded independently, so a level present in only one of
# them would give train and test different columns. Align test to the train
# feature columns (same names, same order): missing columns are filled with 0
# and columns unseen in training are discarded.
feature_cols = df.columns.drop('customer_category')
test = test.reindex(columns=feature_cols, fill_value=0)

df.head()

# Missing Values in Train and Test Dataset

In [None]:
# Missing-value count per column in the training frame.
df.isnull().sum()

In [None]:
# Missing-value count per column in the test frame.
test.isnull().sum()

## Filling Missing values using Simple Imputer

In [None]:
from sklearn.impute import SimpleImputer

# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan,strategy='median')
# Training frame: every column (target included) is imputed with its own median,
# so `x` keeps exactly the column layout of df.
x = imputer.fit_transform(df)

# Test frame: filled with medians learned from the TRAINING data, so both frames
# share the same fill values and no statistics are estimated from the test set.
# df[test.columns] raises KeyError early if test has a column train lacks.
test_imputer = SimpleImputer(missing_values=np.nan,strategy='median')
test_imputer.fit(df[test.columns])
y = test_imputer.transform(test)

In [None]:
# The imputer returns plain arrays; wrap them back into DataFrames using the
# original column names. The row index is reset to a fresh default index.
df = pd.DataFrame(x,columns=df.columns)
test = pd.DataFrame(y,columns=test.columns)

In [None]:
# Re-check the training frame for missing values after imputation.
df.isnull().sum()

In [None]:
# Re-check the test frame for missing values after imputation.
test.isnull().sum()

# Feature Importance

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Impurity-based feature importances from an extra-trees ensemble.
# customer_category is a class label, so a classifier is fitted rather than a
# regressor, and the seed is fixed so the ranking is reproducible across runs.
X_temp = df.drop(['customer_category'],axis=1)
Y_temp = df['customer_category']
model = ExtraTreesClassifier(random_state=42)
model.fit(X_temp,Y_temp)

feat_import = pd.Series(model.feature_importances_,index = X_temp.columns)
# Show at most the 16 most important features, largest first.
ax = feat_import.nlargest(16).plot(kind='bar')
ax.set(title='Extra-trees feature importance', ylabel='importance')
plt.show()

# Splitting The Dataset

In [None]:
# Feature matrix: every column except the target.
X = df.drop(['customer_category'],axis=1)
# Target vector.
Y = df['customer_category']

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 hold-out split with a fixed seed; stratify=Y keeps the class proportions
# of the target the same in both parts.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state = 42,stratify = Y)

In [None]:
# Class counts in the hold-out part.
Y_test.value_counts()

In [None]:
# Class counts in the training part.
Y_train.value_counts()

In [None]:
# Shapes of the four split outputs.
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape

# Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# `ms` is a StandardScaler; it is reused later to transform the competition test set.
ms = StandardScaler()
# Fit the scaling statistics on the training part only, then apply them to the hold-out part.
# From here on X_train / X_test are the arrays returned by the scaler, not DataFrames.
X_train = ms.fit_transform(X_train)
X_test = ms.transform(X_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Logistic regression with default settings, fitted on the scaled training part.
lr = LogisticRegression()
lr.fit(X_train,Y_train)

In [None]:
# Decision tree with default settings.
# NOTE(review): no random_state is set, so the fitted tree may differ between runs.
dt = DecisionTreeClassifier()
dt.fit(X_train,Y_train)

In [None]:
# CatBoost with 500 boosting iterations; verbose=True prints training progress.
# NOTE(review): task_type='GPU' presumably requires a GPU runtime — confirm before running on CPU.
cat = CatBoostClassifier(iterations=500,task_type='GPU')
cat.fit(X_train,Y_train,verbose=True)

In [None]:
# Random forest with default hyper-parameters.
# The seed is fixed so this forest, and the clones of it built by the randomized
# search further down, give the same results on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,Y_train)

In [None]:
# Gradient boosting: 1000 shallow trees (depth 4) with a small learning rate of 0.01;
# a node needs at least 5 samples to be split.
params = {'n_estimators': 1000,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01}
gd =GradientBoostingClassifier(**params)
gd.fit(X_train,Y_train)

In [None]:
# XGBoost with 3000 boosting rounds and a learning rate of 0.01.
xg = XGBClassifier(n_estimators = 3000,learning_rate=0.01)
xg.fit(X_train,Y_train)

In [None]:
from lightgbm import early_stopping

# LightGBM with default hyper-parameters, monitored on the hold-out part.
# Early stopping is passed as a callback: the `early_stopping_rounds` keyword of
# fit() was removed in LightGBM 4, while the callback form works on old and new versions.
lgb = LGBMClassifier()
lgb.fit(
    X_train,
    Y_train,
    eval_set=[(X_test, Y_test)],
    # NOTE(review): a patience of 1000 rounds exceeds the default number of
    # boosting rounds, so stopping is unlikely to trigger — confirm the intent.
    callbacks=[early_stopping(stopping_rounds=1000)],
)

In [None]:
# --- Search space for the random forest --------------------------------------
# Number of trees in the forest: 100, 200, ..., 1200
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split.
# 'auto' was an alias of 'sqrt' for classifiers and was removed in scikit-learn 1.3,
# so it is replaced by 'log2' to keep two distinct, valid options.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 5, 10, ..., 30
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# --- Randomized search --------------------------------------------------------
# 10 sampled configurations x 5-fold cross-validation, scored by accuracy.
# random_state fixes which configurations are sampled; the search works on clones
# of `rf`, so the already-fitted `rf` itself is left untouched.
rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,
                               scoring='accuracy',
                               n_iter=10,cv=5,verbose=2,random_state=42,n_jobs=1)

rf_random.fit(X_train,Y_train)

In [None]:
# Hyper-parameter combination with the best cross-validated score.
rf_random.best_params_

In [None]:
# Scale the competition test features with the scaler fitted on the training part.
final_test = ms.transform(test)

In [None]:
# Class predictions of the tuned random forest on the scaled test set.
pred = rf_random.predict(final_test)

In [None]:
# Hard-label predictions of the individual models (kept for inspection).
test_lr = lr.predict(final_test)
test_rf = rf.predict(final_test)
test_rf_random = rf_random.predict(final_test)
test_gd = gd.predict(final_test)
test_lgb = lgb.predict(final_test)  # computed but not part of the blend below

# Weighted soft vote. Averaging hard labels and truncating the result to int (as
# the submission cell does) would only emit a positive when every model agrees,
# so the class probabilities are blended instead and the most probable class wins.
# All four models were fitted on Y_train, so their predict_proba columns follow
# the same class order as lr.classes_.
blend_weights = [(lr, 0.6), (rf, 0.2), (rf_random, 0.1), (gd, 0.1)]
blended_proba = sum(
    weight * estimator.predict_proba(final_test)
    for estimator, weight in blend_weights
)
final_pred = lr.classes_[np.argmax(blended_proba, axis=1)]

In [None]:
# Re-read the raw test file to recover customer_id, which was dropped from the
# working copy earlier, and pair each id with its blended prediction.
test = pd.read_csv("../input/amazon-ml-engineer-hiring/test.csv")
submit = pd.DataFrame({
    'customer_id': test['customer_id'],
    'customer_category': final_pred,
})
# The submission stores the class as an integer.
submit['customer_category'] = submit['customer_category'].astype(int)

In [None]:
# Write the submission file to the working directory without the index column.
submit.to_csv('submission.csv',index=False)

In [None]:
# Preview the first rows of the submission frame.
submit.head()