In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Global plot styling: dark-grid background with seaborn colour codes enabled.
sns.set(style='darkgrid', color_codes=True)

## Problem Statement
Welcome to Sigma Cab Private Limited, a cab aggregator service. Their customers can download the app on their smartphones and book a cab from anywhere in the cities they operate in. Sigma Cabs, in turn, searches for cabs from various service providers and offers the client the best of the available options. They have been in operation for a little less than a year now. During this period, they have captured the `surge_pricing_type` from the service providers.

You have been hired by Sigma Cabs as a Data Scientist and have been asked to build a predictive model that could help them predict the `surge_pricing_type` proactively. This would, in turn, help them match the right cabs with the right customers quickly and efficiently.



## Data
Variable Definition

- `Trip_ID` - ID for the trip (cannot be used for modelling purposes)
- `Trip_Distance` - The distance of the trip requested by the customer
- `Type_of_Cab` - Category of the cab requested by the customer
- `Customer_Since_Months` - Number of months the customer has been using cab services; 0 means the current month
- `Life_Style_Index` - Proprietary index created by Sigma Cabs showing the lifestyle of the customer based on their behaviour
- `Confidence_Life_Style_Index` - Category showing the confidence in the index mentioned above
- `Destination_Type` - Sigma Cabs assigns every destination to one of 14 categories
- `Customer_Rating` - Average of the lifetime ratings of the customer to date
- `Cancellation_Last_1Month` - Number of trips cancelled by the customer in the last month
- `Var1`, `Var2` and `Var3` - Continuous variables masked by the company; they can be used for modelling purposes
- `Gender` - Gender of the customer
- `Surge_Pricing_Type` - Target variable (the value to predict); can be one of 3 types

In [None]:
file = '/kaggle/input/sigmacabprediction/SigmaCab-Train.csv'
df = pd.read_csv(file)

In [None]:
df.head()

We don't need the Trip ID for modelling, so we drop this column.

In [None]:
df = df.drop(['Trip_ID'],axis=1)

In [None]:
df.describe(include='all').transpose()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isnull().sum()

There are a lot of missing values that we need to deal with.

In [None]:
df.hist(figsize=(15,10))

In [None]:
sns.countplot(df['Surge_Pricing_Type'],hue=df['Gender'])

In [None]:
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
sns.countplot(df['Type_of_Cab'])

plt.subplot(1,2,2)
sns.countplot(df['Type_of_Cab'],hue=df['Surge_Pricing_Type'])

In [None]:
sns.countplot(df['Destination_Type'])

In [None]:
sns.boxplot(df['Life_Style_Index'])

In [None]:
sns.pointplot(df['Surge_Pricing_Type'],df['Cancellation_Last_1Month'])

In [None]:
sns.boxplot(df['Surge_Pricing_Type'],df['Customer_Rating'])

In [None]:
sns.boxplot(df['Surge_Pricing_Type'],df['Var3'])

In [None]:
df.groupby(['Destination_Type'])['Surge_Pricing_Type'].value_counts()

In [None]:
df.groupby(['Destination_Type'])['Surge_Pricing_Type'].value_counts().plot()

In [None]:
df['Destination_Type'].value_counts()

In [None]:
sns.boxplot(df['Surge_Pricing_Type'],df['Life_Style_Index'])

In [None]:
sns.countplot(df['Customer_Since_Months'],hue=df['Surge_Pricing_Type'])

In [None]:
sns.boxplot(df['Surge_Pricing_Type'],df['Trip_Distance'])

In [None]:
sns.pointplot(df['Surge_Pricing_Type'],df['Trip_Distance'])

In [None]:
df.groupby(['Destination_Type'])['Trip_Distance'].mean().sort_values(ascending=False).plot()

In [None]:
plt.figure(figsize=(15,8))
sns.boxplot(df['Destination_Type'],df['Trip_Distance'])

In [None]:
df.groupby(['Type_of_Cab','Surge_Pricing_Type'])['Surge_Pricing_Type'].count()

In [None]:
# Pairwise correlations of the numeric columns only. At this point the frame
# still holds categorical text columns (e.g. Gender, Type_of_Cab), and
# pandas >= 2.0 raises on them in DataFrame.corr() instead of silently dropping
# them. select_dtypes works on both old and new pandas versions.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr,annot=True)

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
imputer = SimpleImputer(missing_values=np.nan,strategy='most_frequent')

In [None]:
imputed_Toc = imputer.fit_transform(np.array(df['Type_of_Cab']).reshape(-1,1))

In [None]:
Toc = pd.DataFrame(imputed_Toc,columns=['Type_of_Cab'])

In [None]:
Toc.head()

In [None]:
Toc.isna().any()

In [None]:
df['imputed_Toc'] = df['Type_of_Cab'].isna()

In [None]:
df['imputed_Toc'] = df['imputed_Toc'].replace([True,False],[1,0])

In [None]:
df.head()

In [None]:
df = df.drop(['Type_of_Cab'],axis=1)

In [None]:
df = pd.concat([df,Toc],axis=1)

In [None]:
df.head()

In [None]:
df['Customer_Since_Months'].value_counts()

In [None]:
df['imputed_Csm'] = df['Customer_Since_Months'].isna()

In [None]:
df['imputed_Csm'] = df['imputed_Csm'].replace([True,False],[1,0])

In [None]:
df['Customer_Since_Months'] = df['Customer_Since_Months'].fillna(0)

In [None]:
df['Customer_Since_Months'].value_counts()

In [None]:
df.isna().sum()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df['Destination_Type'].value_counts()

In [None]:
# Ordinal code for each destination type: 'A' -> 14 counting down to 'M' -> 1.
# NOTE(review): 'N' (2) is ranked above 'M' (1), which breaks the otherwise
# alphabetical order — confirm this ordering is intentional.
DT = dict(zip('ABCDEFGHIJKLNM', range(14, 0, -1)))

In [None]:
df['Destination_Type'] = df['Destination_Type'].map(DT)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
T = pd.get_dummies(df['Type_of_Cab'],drop_first=True,prefix='Toc')

In [None]:
T

In [None]:
df = pd.concat([df,T],axis=1)

In [None]:
G = pd.get_dummies(df['Gender'],drop_first=True,prefix='G')

In [None]:
df = pd.concat([df,G],axis=1)

In [None]:
df = df.drop(['Type_of_Cab','Gender'],axis=1)

In [None]:
df['Confidence_Life_Style_Index'] = df['Confidence_Life_Style_Index'].replace(np.nan,'mis')

In [None]:
C = pd.get_dummies(df['Confidence_Life_Style_Index'],drop_first=True,prefix='CLSI')

In [None]:
df = pd.concat([df,C],axis=1)
df = df.drop(['Confidence_Life_Style_Index'],axis=1)

In [None]:
df.head()

In [None]:
cols = df.columns

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
imp = IterativeImputer(max_iter=10,random_state=0)

In [None]:
df = imp.fit_transform(df)

In [None]:
df = pd.DataFrame(df,columns=cols)

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
corr = df.corr()
plt.figure(figsize=(20,10))
sns.heatmap(corr,annot=True)

In [None]:
df.describe().transpose()

In [None]:
X = df.drop(['Surge_Pricing_Type'],axis=1)
y = df['Surge_Pricing_Type']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0,stratify=y)

In [None]:
x_train.shape

In [None]:
y_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# its learned mean/std are reused for the test split. Calling fit_transform on
# the test split would re-estimate the statistics from test data, so the two
# splits would be scaled with different parameters and the evaluation would be skewed.
scaler = StandardScaler()
xs_train = scaler.fit_transform(x_train)
xs_test = scaler.transform(x_test)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log = LogisticRegression(random_state=1)

In [None]:
model1 = log.fit(xs_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [None]:
y_pred1 = model1.predict(xs_test)

In [None]:
accuracy_score(y_test,y_pred1)

In [None]:
f1_score(y_test,y_pred1,average='macro')

## Support Vector Machines

In [None]:
# The kernel SVM class is spelled `SVC`; `svc` does not exist in sklearn.svm
# and made this cell fail with an ImportError.
from sklearn.svm import LinearSVC,SVC

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtc = DecisionTreeClassifier(max_depth=15,ccp_alpha=0.0)

In [None]:
model3 = dtc.fit(x_train,y_train)

In [None]:
y_pred3 = model3.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred3)

In [None]:
f1_score(y_test,y_pred3,average='macro')

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier()

In [None]:
model4 = rfc.fit(x_train,y_train)

In [None]:
y_pred4 = model4.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred4)

In [None]:
f1_score(y_test,y_pred4,average='macro')

In [None]:
from sklearn.ensemble import AdaBoostClassifier


In [None]:
abcl = AdaBoostClassifier(n_estimators=300,learning_rate=0.3)

In [None]:
model5 = abcl.fit(x_train,y_train)

In [None]:
y_pred5 = model5.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred5)

In [None]:
f1_score(y_test,y_pred5,average='macro')

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbcl  = GradientBoostingClassifier()

In [None]:
model6 = gbcl.fit(x_train,y_train)

In [None]:
y_pred6 = model6.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred6)