In [None]:
import warnings
warnings.filterwarnings('ignore')

### Reading and Understanding the Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
startups = pd.read_csv("../input/startup-logistic-regression/50_Startups.csv")
startups.head()

In [None]:
startups.shape

In [None]:
startups.info()

In [None]:
# Statistical summary
'''As you can see that target varaible "Profit" that contains outlier.'''
startups.describe()

In [None]:
# Share of missing values in each column, expressed in percent
startups.isnull().mean() * 100

### Check for Outliers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pass the column as `x` explicitly so the box is drawn horizontally on every
# seaborn release (a bare positional argument is interpreted as `data` by newer ones).
sns.boxplot(x=startups["R&D Spend"])

In [None]:
# Pass the column as `x` explicitly so the box is drawn horizontally on every
# seaborn release (a bare positional argument is interpreted as `data` by newer ones).
sns.boxplot(x=startups["Administration"])

In [None]:
# Pass the column as `x` explicitly so the box is drawn horizontally on every
# seaborn release (a bare positional argument is interpreted as `data` by newer ones).
sns.boxplot(x=startups["Marketing Spend"])

Inference: None of the predictor variables contains outliers.

    X = ["Administration","Marketing Spend","R&D Spend"]

In [None]:
# The target variable contains an outlier.
# Pass the column as `x` explicitly so the box is drawn horizontally on every
# seaborn release (a bare positional argument is interpreted as `data` by newer ones).
sns.boxplot(x=startups["Profit"])

In [None]:
# Quartiles and inter-quartile range of the target, used to remove its outliers
Q1, Q3 = np.percentile(startups["Profit"], [25, 75])
IQR = Q3 - Q1
IQR

In [None]:
# Remove outliers: drop rows whose Profit falls outside either Tukey fence
# (more than 1.5 * IQR below Q1 or above Q3).
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (startups.Profit < lower_fence) | (startups.Profit > upper_fence)
startups = startups[~is_outlier]

In [None]:
# Pass the column as `x` explicitly so the box is drawn horizontally on every
# seaborn release (a bare positional argument is interpreted as `data` by newer ones).
sns.boxplot(x=startups["Profit"])

In [None]:
# Recheck the dimention of the dataframe
startups.shape

In [None]:
# Statistical summary for modified dataframe
startups.describe()

#### Visualising Numeric Variables

In [None]:
# Numerical columns
sns.pairplot(startups, kind="reg", diag_kind="kde",  hue="State")
plt.show()

<b>Inference:</b> 
    The features "R&D Spend" and "Marketing Spend" have a linear relationship with the target variable.

#### Visualising Categorical Variables

In [None]:
California = startups[startups.State == "California"]
Florida    = startups[startups.State == "Florida"]
New_York   = startups[startups.State == "New York"]

In [None]:
Profit_California = round(California.Profit.mean(), 2)
Profit_Florida    = round(Florida.Profit.mean(), 2)
Profit_New_York   = round(New_York.Profit.mean(), 2)

ax = sns.barplot(x="State", y="Profit", data=startups)

print(f"Average profit in California(in million$) = ${round(Profit_California/1000000,3)}")
print(f"Average profit in Florida   (in million$) = ${round(Profit_Florida/1000000, 3)}")
print(f"Average profit in New_York  (in million$) = ${round(Profit_New_York/1000000, 3)}")

<b>Inference:</b> On average, profits are higher in Florida than in the other states.

In [None]:
# Mean administration spend per state, rounded to two decimals
Administration_California = round(California.Administration.mean(), 2)
Administration_Florida    = round(Florida.Administration.mean(), 2)
Administration_New_York   = round(New_York.Administration.mean(), 2)

# Plot administration spend by state so the chart matches the figures printed below
ax = sns.barplot(x="State", y="Administration", data=startups)

print(f"Average company's administration in California(in million$) = ${round(Administration_California/1000000, 3)}")
print(f"Average company's administration in Florida   (in million$) = ${round(Administration_Florida/1000000, 3)}")
print(f"Average company's administration in New_York  (in million$) = ${round(Administration_New_York/1000000, 3)}")

<b>Inference:</b> On average, administration spend is about the same in all states.

In [None]:
# Mean marketing spend per state, rounded to two decimals
Marketing_California = round(California["Marketing Spend"].mean(), 2)
Marketing_Florida    = round(Florida["Marketing Spend"].mean(), 2)
Marketing_New_York   = round(New_York["Marketing Spend"].mean(), 2)

# Plot marketing spend by state so the chart matches the figures printed below
ax = sns.barplot(x="State", y="Marketing Spend", data=startups)

print(f"Average marketing in California(in million$) = ${round(Marketing_California/1000000, 3)}")
print(f"Average marketing in Florida   (in million$) = ${round(Marketing_Florida/1000000, 3)}")
print(f"Average marketing in New_York  (in million$) = ${round(Marketing_New_York/1000000, 3)}")

<b>Inference:</b> On average, marketing spend is lower in California than in the other states.

### Data Preparation

In [None]:
startups["State"].unique()

In [None]:
# One-hot encode the categorical column(s), dropping the first level as the baseline.
# dtype=int keeps the dummy columns numeric on every pandas version (newer releases
# default to bool), so the frame stays purely numeric for correlation and modelling.
startups = pd.get_dummies(startups, drop_first=True, dtype=int)

# Shorter column names for the modelling steps; rename returns a new frame (no in-place mutation)
startups = startups.rename(columns={"R&D Spend":"R&D", "Marketing Spend":"Marketing",
                                    "State_Florida":"Florida", "State_New York":"New York"})

In [None]:
startups.head()

### Visualise the entire dataset

In [None]:
sns.pairplot(startups, kind="reg", diag_kind="kde")
plt.show()

In [None]:
startups.corr()

<b>Inference:</b> 

     1. In "New York", profit returns are worse and a startup might end up at a loss, while its marketing expenditure is 
        higher than in the other states.
     2. Company profit is highly dependent on R&D expense, as R&D expense is highly correlated with Profit.
     3. The state does not make a significant difference to the company's profit.
     4. When a company spends more on R&D, the quality of its marketing increases, and hence so does its overall profit.

### Splitting the Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split
np.random.seed(0)
startups_train, startups_test = train_test_split(startups, train_size=0.67, test_size=0.33, random_state=42)

In [None]:
startups_train.shape

In [None]:
startups_test.shape

### Rescaling the Features

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
'''There is no multi-collinearity between the predictor variables'''
plt.figure(figsize=[8,5])
sns.heatmap(startups_train.corr(), annot = True, cmap="RdYlGn", center=0.0)
plt.show()

In [None]:
# Min-max scale the numerical columns (the scaler created above is a MinMaxScaler).
# The target "Profit" is scaled together with the predictors.
num_col = ["R&D","Administration","Marketing","Profit"]
# num_col = startups_train.iloc[:,-3:]

# Fit the scaler on the training split only; the same fitted scaler is applied to the
# test split later. NOTE(review): re-running this cell re-fits the scaler on data that
# is already scaled, so run it exactly once per kernel session.
startups_train[num_col] = scaler.fit_transform(startups_train[num_col])
startups_train.reset_index(drop=True, inplace=True)
startups_train.head()

### Dividing into X and Y sets for the model building

In [None]:
y_train = startups_train.pop("Profit")
X_train = startups_train

### Building a linear model

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# VIF values of the feature variables:
def VIF(X):
    """Return the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        The predictor columns to check against each other. Only these columns
        enter the computation.

    Returns
    -------
    pandas.DataFrame
        Two columns, "Features" (column name) and "VIF" (rounded to two
        decimals), sorted by "VIF" in descending order.
    """
    n_features = X.shape[1]
    if n_features < 2:
        # With fewer than two columns there is no other predictor to regress on,
        # so the factor is reported as 1.0 without calling statsmodels.
        vif_values = [1.0] * n_features
    else:
        # Use the columns of the frame that was passed in, cast to float so that
        # integer/bool dummy columns do not produce an object array.
        exog = X.to_numpy(dtype=float)
        vif_values = [variance_inflation_factor(exog, i) for i in range(n_features)]

    vif = pd.DataFrame({"Features": list(X.columns), "VIF": vif_values})
    vif["VIF"] = vif["VIF"].round(2)
    return vif.sort_values(by="VIF", ascending=False)

In [None]:
X_train_1 = X_train[["R&D"]]
X_train_lm = sm.add_constant(X_train_1)
lr_1 = sm.OLS(y_train, X_train_lm).fit()
print(lr_1.summary())

VIF(X_train_1)

In [None]:
X_train_2 = X_train[["R&D","Marketing"]]
X_train_lm2 = sm.add_constant(X_train_2)
lr_2 = sm.OLS(y_train, X_train_lm2).fit()
print(lr_2.summary())

VIF(X_train_2)

<b>Inference:</b>
<!--  -->
    1. The p-values are significant for the coefficients of the predictor variables and for the constant of the 
    regression line, so the association is not purely by chance.
    2. Prob (F-statistic) shows that the model fit is statistically significant, and the explained variance isn't purely 
    by chance.

### Dropping the Variable and Updating the Model
<!--  -->
     Some of the variables have high p-values. Such variables are insignificant and should be dropped.
     Hence, the model is stable with R&D and Marketing.

### Residual analysis 

In [None]:
Predicted_profit  = lr_2.predict(X_train_lm2)
residuals = y_train - Predicted_profit

Act_pred = pd.DataFrame(data={"Predicted":Predicted_profit, "Actual":y_train, "Residual Error":round(residuals,3)})
Act_pred.reset_index(drop=True, inplace=True)
Act_pred.head()

In [None]:
# Q-Q plot
# Compares the quantiles of the residuals with those of a fitted Student's t
# distribution, i.e. it checks the distributional shape of the error terms.
# It does not test homoscedasticity (constant variance); that is judged from
# a residual scatter plot.
import scipy.stats as stats

fig = sm.qqplot(residuals, stats.t, fit=True, line="45")
fig.suptitle("Error Terms", fontsize = 20)
plt.show()

In [None]:
# The errors should show no pattern and be spread evenly around y = 0 (mean 0).
# They are plotted against the fitted values: residuals are correlated with the
# actual target by construction, so plotting against y_train would always show a trend.
plt.scatter(Predicted_profit, residuals)
plt.axhline(y=0.0, color='r', linestyle='-')
plt.xlabel("Predicted profit (scaled)")
plt.ylabel("Residual")
plt.show()

### Making Predictions Using the Final Model

In [None]:
startups_test[num_col] = scaler.transform(startups_test[num_col])
startups_test.head()

In [None]:
startups_test.describe()

### Dividing into X_test and y_test

In [None]:
y_test = startups_test.pop("Profit")
X_test = startups_test

In [None]:
# Add the intercept column, then keep only the columns the final model was fitted on
X_test_lm2 = (
    sm.add_constant(X_test)
    .drop(["Administration", "Florida", "New York"], axis=1)
    .reset_index(drop=True)
)
X_test_lm2.head()

In [None]:
# Making predictions using the second model
Predicted_Profit = lr_2.predict(X_test_lm2)

In [None]:
fig = plt.figure()
plt.scatter(y_test, Predicted_Profit)
fig.suptitle('y_test vs y_pred', fontsize = 20)              # Plot heading 
plt.xlabel('y_test', fontsize = 18)                          # X-label
plt.ylabel('y_pred', fontsize = 16)   
plt.show()

In [None]:
# Actual vs Predicted - Test dataset.
# Build the x-axis from the size of the test split instead of a hard-coded 17,
# so the plot keeps working when the split size changes.
c = list(range(1, len(y_test) + 1))
fig = plt.figure()
plt.plot(c, y_test, color="blue", linewidth=2.5, linestyle="-", label="Actual")
plt.plot(c, Predicted_Profit, color="red", linewidth=2.5, linestyle="-", label="Predicted")
fig.suptitle("Actual and Predicted", fontsize=20)              # Plot heading
plt.xlabel("Index", fontsize=18)                               # X-label
plt.ylabel("Profit", fontsize=16)                              # Y-label
plt.legend()
plt.show()

###  Model Evaluation

In [None]:
# Evaluvate using r-squared metrics
from sklearn.metrics import r2_score

r2 = r2_score(y_test,Predicted_Profit)
round(r2, 4)

In [None]:
# Check the mean error for predicted value and actual value
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, Predicted_Profit)
print(f"Mean Squared Error = {round(mse, 4)}")

In [None]:
# Adjusted R-squared on the test split

# n: number of rows in the test design matrix
n = X_test_lm2.shape[0]

# p: number of predictors. X_test_lm2 also holds the constant column added by
# add_constant, which is not a predictor, so it is excluded from the count.
p = X_test_lm2.shape[1] - 1

adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
round(adjusted_r2, 4)

In [None]:
# Summary table of the test metrics. The MSE is reported as-is (it is not a
# percentage, so its column is labelled "MSE"); both R-squared values are in percent.
mse_percent = round(mse, 4)  # name kept for compatibility; holds the raw MSE
r2_percent = round(r2*100, 2)
adjusted_r2_percent = round(adjusted_r2*100, 2)
result = pd.DataFrame(data={"MSE":[mse_percent], "R-squared%":[r2_percent],
                            "Adjusted R-squared%":[adjusted_r2_percent]})
result

## Equation of model

In [None]:
# Parameters from regression model
parameter = lr_2.params

# The parameter Series is indexed by term name, so access entries by position
# with .iloc; plain integer keys on a label index are a deprecated fallback.
const  = round(parameter.iloc[0], 2)
coeff1 = round(parameter.iloc[1], 2)
coeff2 = round(parameter.iloc[2], 2)
eqn = pd.DataFrame(data={"coeff Constant":[const], "coeff R&D":[coeff1], "coeff Marketing":[coeff2]})
eqn

### Gradient descent

In [None]:
# Gradient descent works on plain NumPy arrays: the design matrix (with constant)
# and the training target
import numpy as np

X = np.array(X_train_lm2)
y = np.array(y_train)

In [None]:
# Theta is the vector of coefficients (intercept, R&D, Marketing), started at zero.
# A flat ndarray is used instead of np.matrix so that it multiplies directly with
# the three-column design matrix.
theta = np.zeros(3)
alpha = 0.5          # learning rate (step size of each update)
iterations = 1000    # number of gradient-descent updates

In [None]:
def compute_cost(X, y, theta):
    """Return half the mean squared error of the predictions ``X @ theta`` against ``y``."""
    errors = np.matmul(X, theta) - y
    squared_error_total = np.sum(np.square(errors))
    return squared_error_total / (2 * len(y))

In [None]:
# gradient descent
def gradient_descent_multi(X, y, theta, alpha, iterations):
    """Run batch gradient descent for linear regression and log every step.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix with one row per observation and one column per coefficient.
    y : numpy.ndarray
        Target values, one per row of ``X``.
    theta : array-like or None
        Starting coefficients. Any shape holding ``X.shape[1]`` values is accepted
        and flattened (e.g. a 1 x k matrix); ``None`` starts from zeros.
    alpha : float
        Learning rate applied to the gradient at each update.
    iterations : int
        Number of updates to perform.

    Returns
    -------
    pandas.DataFrame
        One row per iteration with columns 'Bets' (the coefficient vector after
        the update) and 'cost' (the value of ``compute_cost`` for that vector).

    Raises
    ------
    ValueError
        If ``theta`` does not contain exactly ``X.shape[1]`` values.
    """
    n_coeffs = X.shape[1]
    if theta is None:
        theta = np.zeros(n_coeffs)
    else:
        # Honour the caller's starting point; flatten so row/column vectors work too.
        theta = np.asarray(theta, dtype=float).ravel()
        if theta.size != n_coeffs:
            raise ValueError(
                f"theta has {theta.size} values but X has {n_coeffs} columns"
            )

    m = len(X)
    history = []  # collected as records; building the frame once avoids row-by-row growth

    for _ in range(iterations):
        gradient = (1 / m) * np.matmul(X.T, np.matmul(X, theta) - y)
        theta = theta - alpha * gradient  # new array each step, safe to store
        history.append((theta, compute_cost(X, y, theta)))

    return pd.DataFrame(history, columns=['Bets', 'cost'])

In [None]:
# print costs with various values of coefficients b0, b1, b2
gradient_descent_multi(X, y, theta, alpha, iterations)

In [None]:
# Take the last iteration's row, whatever the value of `iterations` is
Coeff = gradient_descent_multi(X, y, theta, alpha, iterations).iloc[-1]

In [None]:
# Coeff is indexed by the labels 'Bets' and 'cost'; read the coefficient vector by
# label, since integer keys on a label index are a deprecated fallback.
final_theta = Coeff["Bets"]

const  = round(final_theta[0], 2)
coeff1 = round(final_theta[1], 2)
coeff2 = round(final_theta[2], 2)
eqn_gradient = pd.DataFrame(data={"coeff Constant":[const], "coeff R&D":[coeff1], "coeff Marketing":[coeff2]})
eqn_gradient

<b>Inference:</b> The model coefficients obtained are the same in both cases.

$ Profit =  0.08 + 0.79  \times  RD + 0.12  \times  Marketing $