### Import libraries and configure Settings

In [None]:
# IPython magic: render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

In [None]:
# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression

# to handle data in form of rows and columns 
import pandas as pd    

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [None]:
#!pip install missingno
#!pip install statsmodels

In [None]:

import scipy.stats as stats 

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn deprecation
# warnings; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

from scipy.stats import skew
# Use the 'ggplot' style for all matplotlib figures
plt.style.use('ggplot')

import missingno as msno # to get visualization on missing values

from sklearn.model_selection import train_test_split # Sklearn package's randomized data splitting function
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import math

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# pandas display options for this notebook:
#   - show up to 300 rows and up to 400 characters per cell before truncating
#   - print floats with 5 fixed decimals instead of scientific notation
# (The earlier '%.3f' float_format was overridden by the '%.5f' one a few lines
# later and never took effect, so only the effective setting is kept.)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 400)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

import statsmodels.api as sm
print("Import Libraries --> COMPLETED")

In [None]:
# Read the training CSV (path relative to the notebook's working directory) into a pandas DataFrame
cp_df = pd.read_csv("Training_Data_Set.csv")

In [None]:
# (rows, columns) of the training frame
cp_df.shape

In [None]:
# Show the first five records to get a feel for the data structure
cp_df.head()

In [None]:
# Structural overview of the training frame: size, column names,
# missing-value counts / percentages, and number of distinct values per column.
n_rows, n_cols = cp_df.shape
divider = "#" * 40
missing_mask = cp_df.isna()

print("Rows     : ", n_rows)
print("Columns  : ", n_cols)
print(divider, "\n", "Features : \n\n", cp_df.columns.tolist())
# Columns with the most missing values come first
print(divider, "\nMissing values :\n\n", missing_mask.sum().sort_values(ascending=False))
# Share of missing entries per column, in percent, rounded to 2 decimals
print(divider, "\nPercent of missing :\n\n", round(missing_mask.sum() / missing_mask.count() * 100, 2))
print(divider, "\nUnique values :  \n\n", cp_df.nunique())

In [None]:
# Column dtypes, non-null counts and memory usage
cp_df.info()

In [None]:
# Bar chart of non-missing values per column (missingno)
msno.bar(cp_df)

In [None]:
# Columns treated as categorical in this analysis
cat_col = ["fuel_type", "Location", "transmission", "body_type", "Owner Type", "seat_count"]

# For every categorical column print the frequency of each distinct value,
# followed by an empty line and a divider.
separator = "#" * 40
for name in cat_col:
    print(cp_df[name].value_counts(), "", separator, sep="\n")

In [None]:
# Number of rows per manufacture year
cp_df['manufacture_year'].value_counts()

In [None]:
# Summary statistics of the numeric columns, one row per column
cp_df.describe().transpose()

In [None]:
# Number of missing values per column, before any imputation
cp_df.isnull().sum()

In [None]:
# Column labels and dtypes of the raw training frame; the 'Distance ' label used below carries a trailing space
print(cp_df.columns)  # Print out the column names
print(cp_df.dtypes)   # Print out the data types of each column

In [None]:
# Fill missing 'Distance ' values with the median distance of cars sharing the same
# Maker, model and manufacture_year (the median is used because the mean is
# distorted by outliers).
distance_by_group = cp_df.groupby(['Maker', 'model', 'manufacture_year'])['Distance ']
cp_df['Distance '] = distance_by_group.transform(lambda grp: grp.fillna(grp.median()))

In [None]:
# Fall back to the overall median for rows that are still missing 'Distance '.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: with pandas Copy-on-Write
# the in-place call operates on a temporary object and leaves cp_df unchanged.
cp_df['Distance '] = cp_df['Distance '].fillna(cp_df['Distance '].median())

In [None]:
# door_count and seat_count may contain the literal text 'None' for missing values.
# pd.to_numeric(errors='coerce') turns that marker (and any other non-numeric
# text) into NaN and gives both columns a numeric dtype, so the medians computed
# below do not depend on how an object-dtype column is coerced.
cp_df['door_count'] = pd.to_numeric(cp_df['door_count'], errors='coerce')
cp_df['seat_count'] = pd.to_numeric(cp_df['seat_count'], errors='coerce')

# Fill missing door_count with the median door_count of the same Maker / model
cp_df['door_count'] = cp_df.groupby(['Maker', 'model'])['door_count'].transform(lambda x: x.fillna(x.median()))

In [None]:
# Fill missing seat_count with the median seat_count of the same Maker / model
seats_by_model = cp_df.groupby(['Maker', 'model'])['seat_count']
cp_df['seat_count'] = seats_by_model.transform(lambda grp: grp.fillna(grp.median()))

In [None]:

# Encode body_type as integers; values not listed in the mapping are left untouched
body_type_codes = {'compact': 1, 'van': 2}
cp_df['body_type'] = cp_df['body_type'].replace(body_type_codes)


In [None]:
# Fill missing body_type codes with the median code of the same Maker / model.
# NOTE(review): the median of a category code can fall between two codes
# (e.g. 1.5); confirm whether the mode was intended.
body_by_model = cp_df.groupby(['Maker', 'model'])['body_type']
cp_df['body_type'] = body_by_model.transform(lambda grp: grp.fillna(grp.median()))

In [None]:
# Fill missing engine_power with the median engine_power of cars sharing the same
# Maker, model and engine_displacement
power_by_engine = cp_df.groupby(['Maker', 'model', 'engine_displacement'])['engine_power']
cp_df['engine_power'] = power_by_engine.transform(lambda grp: grp.fillna(grp.median()))


In [None]:
# Missing values per column after the group-median imputation steps
cp_df.isnull().sum()

In [None]:
# Rows whose engine_power is still missing after the group-median fill
cp_df[cp_df['engine_power'].isna()]

In [None]:
# Rows whose body_type is still missing after the group-median fill
cp_df[cp_df['body_type'].isna()]

In [None]:
# Fall back to the overall median body_type code for rows that are still missing.
# Assigned back rather than using fillna(..., inplace=True) on the selected column,
# which leaves cp_df unchanged under pandas Copy-on-Write.
cp_df['body_type'] = cp_df['body_type'].fillna(cp_df['body_type'].median())

In [None]:
# Fall back to the overall median engine_power for rows that are still missing.
# Assigned back rather than using fillna(..., inplace=True) on the selected column,
# which leaves cp_df unchanged under pandas Copy-on-Write.
cp_df['engine_power'] = cp_df['engine_power'].fillna(cp_df['engine_power'].median())

In [None]:
# Missing values per column after the overall-median fallbacks
cp_df.isnull().sum()

In [None]:
# Encode 'Owner Type' as an ordinal integer (1 = First ... 4 = Fourth & Above)
owner_type_codes = {'First': 1, 'Second': 2, 'Third': 3, 'Fourth & Above': 4}
cp_df['Owner Type'] = cp_df['Owner Type'].replace(owner_type_codes)


In [None]:

# Label-encode 'Location': each city is replaced by its 1-based position in the
# list below. This is integer label encoding, not dummy/one-hot encoding; no
# new columns are created.
location_names = ['Ahmedabad', 'Bangalore', 'Chennai', 'Coimbatore', 'Delhi', 'Hyderabad',
                  'Jaipur', 'Kochi', 'Kolkata', 'Mumbai', 'Pune']
location_codes = {city: code for code, city in enumerate(location_names, start=1)}
cp_df['Location'] = cp_df['Location'].replace(location_codes)


In [None]:
# Binary-encode transmission: manual -> 0, automatic -> 1
transmission_codes = {'man': 0, 'auto': 1}
cp_df['transmission'] = cp_df['transmission'].replace(transmission_codes)


In [None]:
# Binary-encode fuel_type: diesel -> 0, petrol -> 1
fuel_type_codes = {'diesel': 0, 'petrol': 1}
cp_df['fuel_type'] = cp_df['fuel_type'].replace(fuel_type_codes)

In [None]:
# Label-encode Maker: each make is replaced by its 1-based position in the list
maker_names = ['skoda', 'toyota', 'audi', 'bmw', 'nissan', 'hyundai', 'fiat', 'maserati']
maker_codes = {maker: code for code, maker in enumerate(maker_names, start=1)}
cp_df['Maker'] = cp_df['Maker'].replace(maker_codes)

In [None]:
# Label-encode model: each model name is replaced by its 1-based position in the list
model_names = [
    'octavia', 'superb', 'yaris', 'qashqai', 'x3', 'x1', 'i30', 'q5',
    'x5', 'yeti', 'panda', 'q3', 'coupe', 'micra', 'auris', 'avensis',
    'aygo', 'rapid', 'roomster', 'q7', 'citigo', 'juke', 'tt',
]
model_codes = {model_name: code for code, model_name in enumerate(model_names, start=1)}
cp_df['model'] = cp_df['model'].replace(model_codes)


In [None]:
# First ten rows after imputation and encoding
cp_df.head(10)

In [None]:
# Re-check dtypes after encoding; any column still listed as object was not fully converted to numbers
print(cp_df.dtypes)   # Print out the data types of each column

In [None]:
# Summary statistics after imputation and encoding, one row per column
cp_df.describe().transpose()

In [None]:
#sns.pairplot(cp_df, diag_kind='kde')

In [None]:
# Read the test (submission) CSV into a pandas DataFrame; it is later passed to predict() to produce the 'Price' column of the output files
cp_test_df = pd.read_csv("Test_Data_Set.csv")

In [None]:

# Clean and encode the test set with the same steps used for the training frame.
# Compared with the previous version of this cell:
#   - fillna results are assigned back to the column instead of using
#     fillna(..., inplace=True) on a selected column, which leaves the frame
#     unchanged under pandas Copy-on-Write;
#   - door_count / seat_count are converted with pd.to_numeric so they are numeric
#     before their medians are taken;
#   - the repeated group-median expression lives in one helper.


def _fill_from_group_median(frame, column, group_keys):
    """Return frame[column] with NaNs replaced by the median of their group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding `column` and every column named in `group_keys`.
    column : str
        Column whose missing values are filled.
    group_keys : list of str
        Columns that define the groups.

    Returns
    -------
    pandas.Series
        Result of the grouped transform, aligned with `frame`'s index. Values
        stay NaN where the whole group is missing.
    """
    return frame.groupby(group_keys)[column].transform(lambda grp: grp.fillna(grp.median()))


# Distance: median of the same Maker / model / manufacture_year, then the overall
# median for anything still missing (median rather than mean because of outliers)
cp_test_df['Distance '] = _fill_from_group_median(cp_test_df, 'Distance ', ['Maker', 'model', 'manufacture_year'])
cp_test_df['Distance '] = cp_test_df['Distance '].fillna(cp_test_df['Distance '].median())

# door_count / seat_count: turn the 'None' marker (and any other non-numeric text)
# into NaN with a numeric dtype, then fill with the Maker / model median
for count_col in ('door_count', 'seat_count'):
    cp_test_df[count_col] = pd.to_numeric(cp_test_df[count_col], errors='coerce')
    cp_test_df[count_col] = _fill_from_group_median(cp_test_df, count_col, ['Maker', 'model'])

# body_type: encode 'compact' -> 1 and 'van' -> 2, fill with the Maker / model
# median, then with the overall median
cp_test_df['body_type'] = cp_test_df['body_type'].replace({'compact': 1, 'van': 2})
cp_test_df['body_type'] = _fill_from_group_median(cp_test_df, 'body_type', ['Maker', 'model'])
cp_test_df['body_type'] = cp_test_df['body_type'].fillna(cp_test_df['body_type'].median())

# engine_power: median of the same Maker / model / engine_displacement, then the
# overall median
cp_test_df['engine_power'] = _fill_from_group_median(cp_test_df, 'engine_power', ['Maker', 'model', 'engine_displacement'])
cp_test_df['engine_power'] = cp_test_df['engine_power'].fillna(cp_test_df['engine_power'].median())

# Integer label encoding of the categorical columns (no dummy columns are created).
# The mappings are the same literals used for the training frame.
cp_test_df['Owner Type'] = cp_test_df['Owner Type'].replace({'First': 1, 'Second': 2, 'Third': 3, 'Fourth & Above': 4})

cp_test_df['Location'] = cp_test_df['Location'].replace({'Ahmedabad': 1, 'Bangalore': 2, 'Chennai': 3, 'Coimbatore': 4, 'Delhi': 5, 'Hyderabad': 6, 'Jaipur': 7, 'Kochi': 8, 'Kolkata': 9, 'Mumbai': 10, 'Pune': 11})

cp_test_df['transmission'] = cp_test_df['transmission'].replace({'man': 0, 'auto': 1})

cp_test_df['fuel_type'] = cp_test_df['fuel_type'].replace({'diesel': 0, 'petrol': 1})

cp_test_df['Maker'] = cp_test_df['Maker'].replace({'skoda':1,'toyota':2,'audi':3,'bmw':4,'nissan':5,'hyundai':6,'fiat':7,'maserati':8})

cp_test_df['model'] = cp_test_df['model'].replace({ 'octavia':1,'superb':2,'yaris' :3,'qashqai' :4,'x3' :5,'x1' :6,'i30' :7,'q5' :8,'x5':9,'yeti' :10,'panda' :11,'q3' :12,'coupe' :13,'micra' :14,'auris' :15,'avensis' :16,'aygo' :17,'rapid' :18,'roomster': 19,'q7' :20,'citigo':21,'juke' :22,'tt' :23})

In [None]:
# Copy all the predictor variables into the X dataframe. 'Price' is the dependent variable, so drop it
X = cp_df.drop('Price', axis=1)

# Copy the 'Price' column alone into the y dataframe (double brackets keep it a one-column DataFrame). This is the dependent variable
y = cp_df[['Price']]

In [None]:
# Feature matrix for the submission set. This is a second name for cp_test_df, not a copy.
X_cp_test = cp_test_df

In [None]:
# Split X and y into training and test sets in a 70:30 ratio (test_size=0.30); random_state fixes the shuffle

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
X.head()

In [None]:
# Iteration 1: fit an ordinary least-squares LinearRegression on the training split

regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

In [None]:
# Print the fitted coefficient of every independent attribute.
# coef_ is indexed with [0] because the model was fitted on a one-column target frame.

for col_name, coefficient in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coefficient))

In [None]:
# Let us check the intercept for the model
# (intercept_ is indexed with [0] because the model was fitted on a one-column target frame)

intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

In [None]:
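# We can write our linear model as:
#   Price = intercept + coef_1*X1 + coef_2*X2 + ... + coef_k*Xk
# using the intercept and coefficients printed above.
# NOTE(review): the numeric equation previously recorded here,
#   Y=−21.11–0.35×X1+0.03×X2–0.02×X3–0.01×X4+0.12×X5+0.85×X6–1.90×X7+0.74×X8+1.16×X9
# has only nine terms and appears to come from a different dataset; confirm and
# replace it with this model's fitted values.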

In [None]:
# Model score on the held-out split: R^2, the coefficient of determination
# R^2 = 1 - RSS / TSS

regression_model.score(X_test, y_test)

In [None]:
# So the model explains 67% of the variability in Y using X

In [None]:
#  Iteration -2 

# Since the relationship is not really linear along many dimensions, let us try a polynomial model (degree 2; the next cell uses interaction terms only, i.e. no squared terms)

In [None]:
# Iteration 2: linear regression on degree-2 interaction features
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# interaction_only=True adds the pairwise products x_i * x_j to the original
# features but no squared terms
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_ = poly.fit_transform(X_train)

poly_clf = linear_model.LinearRegression()
poly_clf.fit(X_train_, y_train)

# Report R^2 on the training rows and on the held-out split. The training score
# alone is optimistic for a model with this many derived features. Held-out rows
# go through transform() only, so the expansion fitted on X_train is reused.
print("Train R^2   :", poly_clf.score(X_train_, y_train))
print("Held-out R^2:", poly_clf.score(poly.transform(X_test), y_test))

In [None]:
# Expand the submission features with the transformer fitted on the training
# split (transform, not fit_transform, so nothing is re-fitted on submission data)
X_test_ = poly.transform(X_cp_test)

y_pred = poly_clf.predict(X_test_)

# Take Id from the original frame, as the other submission cells do. Reading it
# from column 1 of the expanded matrix relies on Id being the first feature and
# yields float values, because the expanded matrix is a float array.
output = pd.DataFrame({"Id": X_cp_test["Id"], "Price": y_pred.flatten()})

# NOTE(review): absolute, machine-specific output path; consider a path relative to the notebook
output.to_csv('/Users/rajeshvanamala/Documents/Rajesh/Learning/SEinDSPartB/Pre-Requisite/Hackathon/Sample_Submission_Poly.csv',index=False)


In [None]:
# Shape of the expanded submission feature matrix
print(X_test_.shape)

In [None]:
# Compare the number of raw features with the number of expanded features,
# then display the transformer's configuration
print(X.shape)
print(X_train_.shape)
poly

In [None]:
# Random Forest model prediction steps

from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor

# 32 trees; oob_score=True additionally scores each row using only the trees
# that did not see it during fitting
regressor = RandomForestRegressor(n_estimators=32, random_state=0, oob_score=True)

# Fit on all labelled rows. y is a one-column DataFrame, so it is flattened to
# the 1-D target the estimator expects instead of relying on implicit conversion.
regressor.fit(X, y.to_numpy().ravel())
oob_score = regressor.oob_score_
print(f'Out-of-Bag Score: {oob_score}')

# NOTE: these predictions are for the same rows the forest was fitted on, so the
# error metrics derived from them are in-sample; the out-of-bag score above is
# the less optimistic estimate.
predictions = regressor.predict(X)
# Evaluating the model
mse = mean_squared_error(y, predictions)

In [None]:
# Mean squared error of the random forest on the rows it was fitted on (in-sample)
print(f'Mean Squared Error: {mse}')

In [None]:
# Square root of the MSE above, i.e. the error in the same units as Price
print(f'Root Mean Squared Error: {math.sqrt(mse)}')

In [None]:
# R^2 of the random forest predictions against y, computed on the rows used for fitting (in-sample)
r2 = r2_score(y, predictions)
print(f'R-squared: {r2}')

In [None]:
# Random forest price predictions for the submission set
rf_y = regressor.predict(X_cp_test)

In [None]:


# Assemble the random forest submission (one row per test Id) and write it as CSV
# without the index column.
# NOTE(review): absolute, machine-specific output path.
rf_submission_columns = {"Id": X_cp_test["Id"], "Price": rf_y.flatten()}
output = pd.DataFrame(rf_submission_columns)
output.to_csv(
    '/Users/rajeshvanamala/Documents/Rajesh/Learning/SEinDSPartB/Pre-Requisite/Hackathon/Sample_Submission_RF.csv',
    index=False,
)


In [None]:
# Decision Tree model prediction steps

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# random_state makes the fitted tree reproducible between runs
model = DecisionTreeRegressor(criterion='squared_error', random_state=0)
model.fit(X_train, y_train)

# Print both scores: bare expressions in the middle of a cell are not displayed,
# so the train and test R^2 were previously computed and then discarded.
print(f'Train R-squared: {model.score(X_train, y_train)}')
print(f'Test R-squared: {model.score(X_test, y_test)}')  # performance on test data

# NOTE: X contains the training rows as well as the held-out rows, so this MSE
# is more optimistic than the test score printed above.
predictions = model.predict(X)
mse = mean_squared_error(y, predictions)
print(f'Mean Squared Error: {mse}')

In [None]:
# Square root of the decision tree MSE above, i.e. the error in the same units as Price
print(f'Root Mean Squared Error: {math.sqrt(mse)}')

In [None]:
# R^2 of the decision tree predictions against y over all rows of X (training rows included)
r2 = r2_score(y, predictions)
print(f'R-squared: {r2}')

In [None]:
# Decision tree price predictions for the submission set
dt_y = model.predict(X_cp_test)

In [None]:

# Assemble the decision tree submission (one row per test Id) and write it as CSV
# without the index column.
# NOTE(review): absolute, machine-specific output path.
dt_submission_columns = {"Id": X_cp_test["Id"], "Price": dt_y.flatten()}
output = pd.DataFrame(dt_submission_columns)
output.to_csv(
    '/Users/rajeshvanamala/Documents/Rajesh/Learning/SEinDSPartB/Pre-Requisite/Hackathon/Sample_Submission_DT.csv',
    index=False,
)
