In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# State - char variable (Encoding is required)

# Pre-processing
# part 1 : handling missing value
# part 2 : handling encoding part (label encoder, one hot encoder, dummy)
# part 3 : handling outlier 
# part 4 : feature scaling - standarisation and normalisation
# part 5 : handling imbalance dataset

In [4]:
# part 1 : handling missing value

df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [5]:
# part 2 : handling encoding part (label encoder, one hot encoder, dummy)

# label encoder
df['State'] = df['State'].astype('category')
df['State'] = df['State'].cat.codes

In [6]:
df['State'].value_counts()

2    17
0    17
1    16
Name: State, dtype: int64

In [7]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [8]:
# one hot encoder
df = pd.get_dummies(df,columns=['State'])

In [9]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_0,State_1,State_2
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


In [None]:
# Dummy Variables - n-1
# Removing 1 state column to avoid multicollinearity

In [10]:
df = df.drop(columns=['State_0'])

In [11]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_1,State_2
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [None]:
# separating indepedent and depedent variables

x = df.drop(columns=['Profit'])
y = df['Profit']

print(x.head())
print('\n')
print(y.head())

In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.75, random_state=101)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.75,random_state=501)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(37, 5)
(13, 5)
(37,)
(13,)


# Decision Tree Regressor

In [18]:
from sklearn.tree import DecisionTreeRegressor
DT_Reg = DecisionTreeRegressor()
DT_Model = DT_Reg.fit(x_train, y_train)

In [19]:
y_train_pred = DT_Model.predict(x_train)
y_test_pred = DT_Model.predict(x_test)

In [23]:
from sklearn.metrics import r2_score

print(r2_score(y_train,y_train_pred ))
print('\n')
print(r2_score(y_test,y_test_pred))

1.0


0.8746278184383132


# RandomForest Regressor

In [24]:
from sklearn.ensemble import RandomForestRegressor
RF_reg = RandomForestRegressor()
RF_mod = RF_reg.fit(x_train,y_train)

In [37]:
y_predict_train_RF = RF_mod.predict(x_train)
y_predict_test_RF = RF_mod.predict(x_test)

In [39]:
print(r2_score(y_train, y_predict_train_RF))
print("\n")
print(r2_score(y_test, y_predict_test_RF))

0.9892170275985217


0.8619989085156641


# Linear Regression

In [40]:
from sklearn.linear_model import LinearRegression
lm_mod = LinearRegression()
lr = lm_mod.fit(x_train,y_train)

In [41]:
y_predict_train_lr = lr.predict(x_train)
y_predict_test_lr = lr.predict(x_test)

In [44]:
print(f'Train Accuracy :', r2_score(y_train,y_predict_train_lr))
print('\n')
print(f'Test Accuracy :', r2_score(y_test,y_predict_test_lr))



Train Accuracy : 0.960389517551679


Test Accuracy : 0.8242696380420215


# conclude :

### Decision Tree Regressor : Accuracy ****************
##### Train : 100%
##### Test : 87.45%


### Random Forest Regressor : Accuracy
##### Train : 99%
##### Test : 86.19%

### Linear Regression : Accuracy
##### Train : 96%
##### Test : 82.4%