In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the startups dataset from the notebook's working directory; the cells
# below use its State column and its numeric spend / Profit columns.
data = pd.read_csv("50_Startups.csv")

In [11]:
# One-hot encode the categorical State column and place the dummy columns in
# front of the remaining columns (positions 0, 1, 2 and 4 of `data`), so the
# label (Profit) stays the last column of finalData.
# dtype=float keeps every column numeric: newer pandas versions otherwise emit
# boolean dummies, and mixing bool with float columns turns `.values` into an
# object array.
finalData = pd.concat(
    [pd.get_dummies(data.State, dtype=float), data.iloc[:, [0, 1, 2, 4]]],
    axis=1,
)

# Only the last expression of a cell is rendered automatically, so the preview
# must be displayed explicitly (display() is provided by IPython/Jupyter).
display(finalData.head())

# Pairwise correlation of the numeric columns. State holds strings and has to
# be excluded explicitly: pandas >= 2.0 raises instead of silently dropping
# non-numeric columns.
data.select_dtypes(include="number").corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.241955,0.724248,0.9729
Administration,0.241955,1.0,-0.032154,0.200717
Marketing Spend,0.724248,-0.032154,1.0,0.747766
Profit,0.9729,0.200717,0.747766,1.0


In [6]:
# Split finalData into model inputs and target: every column except the last
# is a feature, and the last column is kept as a 2-D (n_rows, 1) label array.
features = finalData.iloc[:, :-1].to_numpy()
label = finalData.iloc[:, [-1]].to_numpy()

# Method 1 - Correlation Analysis

In [7]:
# Some Suggestions by Prashant Nair
# 1. Correlation analysis is only applicable to the numeric columns of the dataset.
# 2. Decide the percentage threshold for the correlation value.
#      - Select those features whose correlation is greater than or equal to 50% (Regression)
#      - For Classification, AVOID using this technique (if your BOSS tells you to perform this,
#        then the threshold must be 80% or more)
#
# 3. When using correlation analysis for regression, always back it up with backward elimination
#    for double confirmation.
# 4. Helps to clear the problem of multi-collinearity (identifying multiple features with the same corr value
#    (Suggestion: the 0.xxx digits must be the same))

In [8]:
# Correlation matrix of the numeric columns only. The State column holds
# strings, and pandas >= 2.0 raises if such a column is passed to corr().
data.select_dtypes(include="number").corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.241955,0.724248,0.9729
Administration,0.241955,1.0,-0.032154,0.200717
Marketing Spend,0.724248,-0.032154,1.0,0.747766
Profit,0.9729,0.200717,0.747766,1.0


In [9]:
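# Profit row of data.corr():  R&D Spend 0.9729 | Administration 0.2007 | Marketing Spend 0.7478 | Profit 1.0
# Create the model using R&D Spend and Marketing Spend (the two features at or above the 50% threshold)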

# Method 2: RFE (Recursive Feature Elimination)

In [8]:
# RFE can be applied only to estimators that expose coef_ or feature_importances_ after fitting
#
# 1. Regression
#    - LinearRegression (coef_)
#    - SupportVectorRegressor (coef_, linear kernel only)
#    - DecisionTreeRegressor (feature_importances_)
#    - RandomForestRegressor (feature_importances_)

# 2. Classification (algorithms that support the feature importance variable)
#    - DecisionTreeClassifier
#    - RandomForestClassifier


In [9]:
#Steps to apply RFE:
# 1. Initialize the model's algorithm
# 2. Apply RFE to model (ALL FEATURES and Label)
# 3. Get Features with High Ranking

In [28]:
# 1. Initialize the model's algorithm
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# RFE ranks features using the fitted estimator's coef_ / feature_importances_.
# A decision tree is used here; swap in LinearRegression() to compare.
# random_state pins the tree's tie-breaking so the ranking is reproducible.
modelForRFE = DecisionTreeRegressor(max_depth=8, random_state=0)
lrModel = modelForRFE  # name kept because the original cell also defined lrModel

# 2. Apply RFE to model (ALL FEATURES and Label)
from sklearn.feature_selection import RFE
# n_features_to_select is left at its default, which keeps half of the features.
selectFeaturesRFE = RFE(estimator=modelForRFE,
                        step=1)  # eliminate one feature per iteration

# scikit-learn expects a 1-D target here; flattening the label avoids the
# DataConversionWarning raised for a column-vector y.
selectFeaturesRFE.fit(features, label.ravel())

# 3. Get Features with High Ranking
# The last column of finalData is the label, so it is dropped to keep the
# printed names aligned with ranking_ / support_ (one entry per feature).
print(finalData.columns[:-1])
print(selectFeaturesRFE.ranking_)
print(selectFeaturesRFE.support_)

Index(['California', 'Florida', 'New York', 'R&D Spend', 'Administration',
       'Marketing Spend', 'Profit'],
      dtype='object')
[3 2 4 1 1 1]
[False False False  True  True  True]


  y = column_or_1d(y, warn=True)


In [None]:
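# According to RFE with LinearRegression, Profit can be determined easily based on the company's location
# According to RFE with DecisionTreeRegressor, Profit can be determined using Florida and R&D data
# (the output shown above instead keeps R&D Spend, Administration and Marketing Spend, so this note may come from a different run)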

# Feature Elimination using ANOVA (Univariate Analysis using Anova)

In [20]:
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
# from sklearn.feature_selection import f_classif
# Regression     ----> f_regression
# Classification ----> f_classif

# Keep the top 50% of the features by univariate F-score. The percentile is a
# tunable choice; 50 is the rule of thumb used in this notebook.
selectFeaturesANOVA = SelectPercentile(percentile=50, score_func=f_regression)

# The scorer expects a 1-D target; flattening the label avoids the
# DataConversionWarning raised for a column-vector y.
selectFeaturesANOVA.fit(features, label.ravel())

# Boolean mask with one entry per feature column (True = feature is kept).
selectFeaturesANOVA.get_support()


  y = column_or_1d(y, warn=True)


array([False, False, False,  True,  True,  True])

In [21]:
# ANOVA says: keep R&D Spend, Administration and Marketing Spend

# Feature Select By Model

In [23]:
# 1. Initialize the model's algorithm
from sklearn.linear_model import LinearRegression
modelForSBM = LinearRegression()

# 2. Apply SBM to model (ALL FEATURES and Label)
from sklearn.feature_selection import SelectFromModel
# NOTE(review): with a linear model the selection is driven by coefficient
# magnitude, which depends on each feature's scale; the features are not
# scaled here, so interpret the resulting mask with care.
selectFeaturesSBM = SelectFromModel(modelForSBM)

selectFeaturesSBM.fit(features, label)

# 3. Get the selected features
# The last column of finalData is the label, so it is dropped to keep the
# printed names aligned with the support mask (one entry per feature).
print(finalData.columns[:-1])
print(selectFeaturesSBM.get_support())

Index(['California', 'Florida', 'New York', 'R&D Spend', 'Administration',
       'Marketing Spend', 'Profit'],
      dtype='object')
[ True  True  True False False False]


In [30]:
#Final

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Fit a linear model on the first four feature columns for random_state values
# 1..200 and report every seed whose held-out score beats the training score.
# After the loop, X_train / X_test / y_train / y_test hold the split of the
# last iteration (random_state=200).
# NOTE(review): picking a seed by its test score leaks test information.
for i in range(1, 201):
    X_train, X_test, y_train, y_test = train_test_split(
        features[:, :4],
        label,
        test_size=0.2,
        random_state=i,
    )

    model = LinearRegression().fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    if train_score < test_score:
        print("Test: {} Train: {} RS: {}".format(test_score, train_score, i))

Test: 0.9611102529882972 Train: 0.9387161169637389 RS: 1
Test: 0.9775946857817924 Train: 0.9346763047942375 RS: 2
Test: 0.9559654175007252 Train: 0.942246336812637 RS: 3
Test: 0.9590556616168892 Train: 0.9424803388022351 RS: 4
Test: 0.9724761281808778 Train: 0.9384811616912291 RS: 5
Test: 0.981382877466359 Train: 0.9360316917181242 RS: 10
Test: 0.9525970614685351 Train: 0.9429161309250664 RS: 12
Test: 0.9665035917005049 Train: 0.9382536651204462 RS: 14
Test: 0.9458552878814129 Train: 0.9448618299320201 RS: 20
Test: 0.9598546868412196 Train: 0.9424357627200556 RS: 21
Test: 0.96754829785172 Train: 0.9393665190660465 RS: 22
Test: 0.9584704147342747 Train: 0.9432925534944119 RS: 24
Test: 0.9628182291505862 Train: 0.939518924777711 RS: 26
Test: 0.9523021832223025 Train: 0.9429070380091912 RS: 29
Test: 0.9452326715138092 Train: 0.940117391975616 RS: 31
Test: 0.9559154260857984 Train: 0.9307968670535005 RS: 34
Test: 0.9561023815459189 Train: 0.9429880672597816 RS: 38
Test: 0.9618335153549322 

In [33]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Build the train/test split explicitly instead of relying on whatever
# X_train / X_test an earlier cell happened to leave behind. The columns and
# random_state=200 match the final iteration of the seed-search loop, so a
# top-to-bottom run trains on the same data as before.
X_train, X_test, y_train, y_test = train_test_split(features[:, [0, 1, 2, 3]],
                                                    label,
                                                    test_size=0.2,
                                                    random_state=200)

lrModel = LinearRegression()
lrModel.fit(X_train, y_train)

# Let's explore the fitted equation
print("Co-efficients: ")
print(lrModel.coef_)
print("Intercept: ")
print(lrModel.intercept_)

# 6. Check the quality of the model
# score() of a regressor returns R^2 (coefficient of determination), not a
# classification accuracy.
print ( " Train score %r " % lrModel.score(X_train,y_train))
# To ensure the model quality is GOOD, it must also perform well on unseen data
print ( " Test score %r " %lrModel.score(X_test,y_test))

Co-efficients: 
[[-9.87206147e+02  1.11121257e+03 -1.24006423e+02  8.66830052e-01]]
Intercept: 
[46551.37128798]
 Train score 0.9532121595946517 
 Test score 0.8860586680822431 
