In [None]:
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import tensorflow as tf
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from warnings import filterwarnings
filterwarnings('ignore')
from kerastuner.tuners import RandomSearch

In [None]:
# Load the rideshare CSV from the relative input directory into `df`.
df=pd.read_csv('../input/rideshare_kaggle.csv') 

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Remove the 'id' column; it is not used by anything that follows.
df = df.drop(columns=['id'])

In [None]:
# Confirm the 'id' column is gone.
df.head()

In [None]:
# Drop the columns that are not carried into the analysis: raw time fields,
# the daily high/low/min/max temperature readings (and their timestamps),
# and a handful of descriptive or duplicated fields.
df = df.drop(
    columns=[
        # time fields
        'timestamp', 'datetime', 'windGustTime',
        # daily high / low temperatures and when they occurred
        'temperatureHigh', 'temperatureHighTime',
        'temperatureLow', 'temperatureLowTime',
        'apparentTemperatureHigh', 'apparentTemperatureHighTime',
        'apparentTemperatureLow', 'apparentTemperatureLowTime',
        # sun / precipitation / UV extras
        'sunriseTime', 'sunsetTime', 'precipIntensityMax', 'uvIndexTime',
        # daily min / max temperatures and when they occurred
        'temperatureMin', 'temperatureMinTime',
        'temperatureMax', 'temperatureMaxTime',
        'apparentTemperature',
        'apparentTemperatureMin', 'apparentTemperatureMinTime',
        'apparentTemperatureMax', 'apparentTemperatureMaxTime',
        # descriptive / duplicated fields
        'long_summary', 'icon', 'product_id', 'timezone',
        'visibility.1', 'destination',
    ]
)

In [None]:
# Preview the frame after the column drop.
df.head()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary statistics (count / unique / top / freq) for the object-typed columns.
df.describe(include='object')

In [None]:
# (rows, columns) after dropping the unused columns.
df.shape

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:

# Visualise where the missing values sit: one heatmap cell per (row, column),
# drawn without a colour bar.
fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(df.isna(), cbar=False, ax=ax)
plt.show()

In [None]:
# Keep only the rows that have a price (the regression target).
# `.notna()` replaces the `isnull() == False` comparison, and `.copy()` makes
# the result an independent frame, because later cells assign columns into
# `df` and should not be writing into a filtered slice.
df = df[df['price'].notna()].copy()

In [None]:
# Re-check missing values per column after removing rows without a price.
df.isnull().sum()

In [None]:

# Same missing-value heatmap as before, redrawn on the filtered frame.
fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(df.isna(), cbar=False, ax=ax)
plt.show()

In [None]:
# (rows, columns) after removing rows without a price.
df.shape

In [None]:
# Numeric columns of `df` (this includes the target 'price').
num_col=df.select_dtypes(include='number')

In [None]:
# (rows, columns) of the numeric subset.
num_col.shape

In [None]:
# Object-typed (categorical) columns of `df`.
cat_col=df.select_dtypes(include='object')

In [None]:
# (rows, columns) of the categorical subset.
cat_col.shape

In [None]:
# Preview the categorical columns.
cat_col.head()

In [None]:
# Distribution plot for every numeric column, laid out on a 3-wide grid.
plt.figure(figsize=(60,55))
plt.rcParams.update({'font.size': 40})
n_grid_cols = 3
# Keep the original 8-row grid, but grow it when there are more than 24
# columns so plt.subplot is never asked for a cell that does not exist.
n_grid_rows = max(8, (num_col.shape[1] + n_grid_cols - 1) // n_grid_cols)
for j, column in enumerate(num_col.columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, j)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) once the seaborn version is confirmed.
    sns.distplot(num_col[column])
# A single layout pass once all axes exist, instead of one pass per subplot.
plt.tight_layout()

In [None]:
# Scatter plot of every numeric feature against 'price' on a 3-wide grid.
plt.figure(figsize=(60,55))
plt.rcParams.update({'font.size': 40})
# The target itself is not plotted against itself.
feature_names = [name for name in num_col.columns if name != 'price']
n_grid_cols = 3
# Keep the original 8-row grid, but grow it when more than 24 features exist
# so plt.subplot is never asked for a cell that does not exist.
n_grid_rows = max(8, (len(feature_names) + n_grid_cols - 1) // n_grid_cols)
for j, column in enumerate(feature_names, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, j)
    sns.scatterplot(x=num_col[column],y=num_col['price'])
# A single layout pass once all axes exist, instead of one pass per subplot.
plt.tight_layout()

In [None]:
# Keep an independent copy of the categorical columns before 'price' is
# attached to `cat_col` in the next cell.
cat_col1=cat_col.copy()

In [None]:
# Attach the target to the categorical frame so it can be plotted per category.
cat_col['price']=df['price']

In [None]:
# Box plot of 'price' for every categorical column, stacked in one column.
plt.figure(figsize=(70,69))
plt.rcParams.update({'font.size': 40})
# 'price' was attached to cat_col only as the y-axis variable; skip it here.
category_names = [name for name in cat_col.columns if name != 'price']
# Keep the original 8-row layout, but grow it if there are more categorical
# columns than rows so plt.subplot never gets an out-of-range index.
n_grid_rows = max(8, len(category_names))
for j, column in enumerate(category_names, start=1):
    plt.subplot(n_grid_rows, 1, j)
    sns.boxplot(x=cat_col[column],y=cat_col['price'])
# A single layout pass once all axes exist, instead of one pass per subplot.
plt.tight_layout()

In [None]:
# Pairwise correlation matrix of the numeric columns.
corr=num_col.corr()

In [None]:
# Heatmap restricted to strong correlations: entries with |r| < 0.7 are
# blanked out (NaN) so only the strongly related pairs are annotated.
plt.figure(figsize=(30, 30))
sns.set(font_scale=2)

strong_corr = corr.where(corr.abs() >= 0.7)
sns.heatmap(
    strong_corr,
    vmin=-1.0,
    vmax=1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 18},
    square=True,
);

plt.title('Correlation between numeric features')
plt.show()

In [None]:
# Box plot of every numeric column (used to spot outliers), 3-wide grid.
plt.figure(figsize=(60,55))
plt.rcParams.update({'font.size': 40})
n_grid_cols = 3
# Keep the original 8-row grid, but grow it when there are more than 24
# columns so plt.subplot is never asked for a cell that does not exist.
n_grid_rows = max(8, (num_col.shape[1] + n_grid_cols - 1) // n_grid_cols)
for j, column in enumerate(num_col.columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, j)
    sns.boxplot(x=num_col[column])
# A single layout pass once all axes exist, instead of one pass per subplot.
plt.tight_layout()

In [None]:
# Columns selected for outlier capping. `.copy()` makes this an independent
# frame so the in-place capping below does not write into a slice of `df`.
data2=df[['price','distance','latitude','temperature','windGust','visibility','dewPoint']].copy()

In [None]:
# Outlier treatment: capping (winsorization-style).
# For every column of data2:
#   * values above Q3 + 1.5*IQR are replaced by the 99th percentile,
#   * values below Q1 - 1.5*IQR are replaced by the 1st percentile.
# All thresholds are computed from the column before any value is changed.
# The replacement is vectorized with Series.mask instead of a per-row Python
# loop with .loc writes, which is prohibitively slow on a large frame.
for i in data2.columns:
    values = data2[i]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    ub = q3 + 1.5*iqr
    lb = q1 - 1.5*iqr
    uc = values.quantile(0.99)
    lc = values.quantile(0.01)
    # Same order as the row-wise version: cap the high side first, then the
    # low side on the already-capped values.
    capped = values.mask(values > ub, uc)
    capped = capped.mask(capped < lb, lc)
    data2[i] = capped

In [None]:
# Box plots of the capped columns, to check the effect of the outlier treatment.
plt.figure(figsize=(60,55))
plt.rcParams.update({'font.size': 40})
n_grid_cols = 3
# Keep the original 8-row grid, but grow it when there are more than 24
# columns so plt.subplot is never asked for a cell that does not exist.
n_grid_rows = max(8, (data2.shape[1] + n_grid_cols - 1) // n_grid_cols)
for j, column in enumerate(data2.columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, j)
    sns.boxplot(x=data2[column])
# A single layout pass once all axes exist, instead of one pass per subplot.
plt.tight_layout()

In [None]:
# Copy each capped column back into the main frame, overwriting the original
# values column by column.
for col_name, capped_values in data2.items():
    df[col_name] = capped_values

In [None]:
# Rebuild the numeric subset so it reflects the capped values written back to `df`.
num_col=df.select_dtypes(include='number')

In [None]:
# Box plot of every numeric column after the outlier capping, 3-wide grid.
plt.figure(figsize=(60,55))
plt.rcParams.update({'font.size': 40})
n_grid_cols = 3
# Keep the original 8-row grid, but grow it when there are more than 24
# columns so plt.subplot is never asked for a cell that does not exist.
n_grid_rows = max(8, (num_col.shape[1] + n_grid_cols - 1) // n_grid_cols)
for j, column in enumerate(num_col.columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, j)
    sns.boxplot(x=num_col[column])
# A single layout pass once all axes exist, instead of one pass per subplot.
plt.tight_layout()

In [None]:
# Display the categorical frame (it also carries the 'price' column added earlier).
cat_col

In [None]:
# Frequency of each value in the 'source' column.
df['source'].value_counts()

In [None]:
# Frequency of each value in the 'cab_type' column.
df['cab_type'].value_counts()

In [None]:
# Frequency of each value in the 'name' column.
df['name'].value_counts()

In [None]:
# Frequency of each value in the 'short_summary' column.
df['short_summary'].value_counts()

In [None]:
# One-hot encode 'cab_type'; drop_first=True omits the first level's indicator column.
df=pd.get_dummies(df,columns=['cab_type'],drop_first=True)

In [None]:
# One-hot encode 'short_summary'; drop_first=True omits the first level's indicator column.
df=pd.get_dummies(df,columns=['short_summary'],drop_first=True)

In [None]:
# One-hot encode 'name'; drop_first=True omits the first level's indicator column.
df=pd.get_dummies(df,columns=['name'],drop_first=True)

In [None]:
# One-hot encode 'source'; drop_first=True omits the first level's indicator column.
df=pd.get_dummies(df,columns=['source'],drop_first=True)

In [None]:
# Preview the frame after one-hot encoding.
df.head()

# **Base Model**

In [None]:
# Work on a copy so the modelling steps leave `df` untouched.
data=df.copy()

In [None]:
# Target is 'price'; the features are every other column.
# 70/30 train/test split with a fixed seed for repeatability.
Y = data['price']
X = data.drop(columns=['price'])
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=48
)

In [None]:
# Baseline model: linear regression with default settings.
lm=LinearRegression()

In [None]:
# Fit the baseline model on the training split.
lm.fit(x_train,y_train)

In [None]:
# Predict prices for the held-out test split.
pred=lm.predict(x_test)

In [None]:
# R^2 of the baseline model on the test split.
r2_score(y_test,pred)

In [None]:
# Mean-centre every numeric column before the VIF computation below.
# NOTE(review): presumably done because variance_inflation_factor is given the
# raw matrix without an added constant column — confirm.
# NOTE(review): num_col still contains the target 'price' at this point; check
# whether it should be excluded before computing VIFs on the predictors.
num_col=num_col-num_col.mean()

In [None]:
 vif=pd.DataFrame()

In [None]:
# Variance inflation factor of every column in num_col, highest first.
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(num_col.values, col_idx)
        for col_idx in range(num_col.shape[1])
    ],
    'feature': num_col.columns,
})
vif.sort_values(by='VIF', ascending=False)

In [None]:
# Remove 'dewPoint' before recomputing the VIFs.
num_col = num_col.drop(columns=['dewPoint'])

In [None]:
# VIFs of the remaining numeric columns, highest first.
vif1 = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(num_col.values, col_idx)
        for col_idx in range(num_col.shape[1])
    ],
    'feature': num_col.columns,
})
vif1.sort_values(by='VIF', ascending=False)

In [None]:
# Remove 'month' before recomputing the VIFs.
num_col = num_col.drop(columns=['month'])

In [None]:
# VIFs of the remaining numeric columns, highest first.
vif1 = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(num_col.values, col_idx)
        for col_idx in range(num_col.shape[1])
    ],
    'feature': num_col.columns,
})
vif1.sort_values(by='VIF', ascending=False)

In [None]:
# Remove 'windSpeed' before recomputing the VIFs.
num_col = num_col.drop(columns=['windSpeed'])

In [None]:
# VIFs of the remaining numeric columns, highest first.
vif1 = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(num_col.values, col_idx)
        for col_idx in range(num_col.shape[1])
    ],
    'feature': num_col.columns,
})
vif1.sort_values(by='VIF', ascending=False)

In [None]:
# Remove 'pressure' before recomputing the VIFs.
num_col = num_col.drop(columns=['pressure'])

In [None]:
# VIFs of the remaining numeric columns, highest first.
vif1 = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(num_col.values, col_idx)
        for col_idx in range(num_col.shape[1])
    ],
    'feature': num_col.columns,
})
vif1.sort_values(by='VIF', ascending=False)

In [None]:
# Remove 'precipProbability' before recomputing the VIFs.
num_col = num_col.drop(columns=['precipProbability'])

In [None]:
# VIFs of the remaining numeric columns, highest first.
vif1 = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(num_col.values, col_idx)
        for col_idx in range(num_col.shape[1])
    ],
    'feature': num_col.columns,
})
vif1.sort_values(by='VIF', ascending=False)

In [None]:
# Remove from the modelling frame the same five columns that were dropped
# from num_col during the VIF passes above.
data = data.drop(
    columns=['dewPoint', 'month', 'windSpeed', 'pressure', 'precipProbability']
)

# **Model-2**

In [None]:
# Rebuild features/target from the reduced frame and split again with the
# same 70/30 ratio and seed as the base model.
Y = data['price']
X = data.drop(columns=['price'])
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=48
)

In [None]:
# Second model: linear regression on the reduced feature set.
lm1=LinearRegression()

In [None]:
# Fit the second model on the training split.
lm1.fit(x_train,y_train)

In [None]:
# Predict prices for the test split (this overwrites the base model's `pred`).
pred=lm1.predict(x_test)

In [None]:
# R^2 of the second model on the test split.
r2_score(y_test,pred)