<a href="https://colab.research.google.com/github/roshk10/startup-profit-prediction/blob/main/notebooks/startup_profit_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd

file_path = "/content/drive/MyDrive/ML_Projects startup-profit-prediction data/50_Startups.csv"
df = pd.read_csv(file_path)

df.head()


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
df.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [None]:
df.shape

(50, 5)

In [None]:
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [None]:
# Separate input features and target variable
X = df.drop('Profit', axis=1)
y = df['Profit']

X.head(), y.head()


(   R&D Spend  Administration  Marketing Spend       State
 0  165349.20       136897.80        471784.10    New York
 1  162597.70       151377.59        443898.53  California
 2  153441.51       101145.55        407934.54     Florida
 3  144372.41       118671.85        383199.62    New York
 4  142107.34        91391.77        366168.42     Florida,
 0    192261.83
 1    191792.06
 2    191050.39
 3    182901.99
 4    166187.94
 Name: Profit, dtype: float64)

In [None]:
X.dtypes


Unnamed: 0,0
R&D Spend,float64
Administration,float64
Marketing Spend,float64
State,object


In [None]:
# One-hot encode the 'State' column
X_encoded = pd.get_dummies(X, drop_first=True)

X_encoded.head()


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,False,True
1,162597.7,151377.59,443898.53,False,False
2,153441.51,101145.55,407934.54,True,False
3,144372.41,118671.85,383199.62,False,True
4,142107.34,91391.77,366168.42,True,False


In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
    )

X_train.shape, X_test.shape



((40, 5), (10, 5))

In [None]:
from sklearn.linear_model import LinearRegression

# Create Linear Regression model
model_lr = LinearRegression()

# Train the model
model_lr.fit(X_train, y_train)


In [None]:
# Predict profit for test data
y_pred = model_lr.predict(X_test)

y_pred[:5]


array([126362.87908255,  84608.45383634,  99677.49425147,  46357.46068582,
       128750.48288504])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

mae, rmse, r2


(6961.477813252376, np.float64(9055.957323458464), 0.8987266414328637)

In [None]:
from sklearn.linear_model import Ridge

# Create Ridge model
model_ridge = Ridge(alpha=1.0)

# Train Ridge model
model_ridge.fit(X_train, y_train)

# Predict
y_pred_ridge = model_ridge.predict(X_test)


In [None]:
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)

mae_ridge, rmse_ridge, r2_ridge


(6963.340034795974, np.float64(9049.186353499541), 0.8988780252113923)

In [None]:
from sklearn.linear_model import Lasso

# Create Lasso model
model_lasso = Lasso(alpha=0.1)

# Train Lasso model
model_lasso.fit(X_train, y_train)

# Predict
y_pred_lasso = model_lasso.predict(X_test)


In [None]:
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)

mae_lasso, rmse_lasso, r2_lasso


(6961.487870576682, np.float64(9055.92322044301), 0.8987274041838984)

In [None]:
import pandas as pd

results = pd.DataFrame({
    'Model': ['Linear Regression', 'Ridge Regression', 'Lasso Regression'],
    'MAE': [mae, mae_ridge, mae_lasso],
    'RMSE': [rmse, rmse_ridge, rmse_lasso],
    'R2 Score': [r2, r2_ridge, r2_lasso]
                })

results


Unnamed: 0,Model,MAE,RMSE,R2 Score
0,Linear Regression,6961.477813,9055.957323,0.898727
1,Ridge Regression,6963.340035,9049.186353,0.898878
2,Lasso Regression,6961.487871,9055.92322,0.898727


In [None]:
feature_importance = pd.DataFrame({
      'Feature': X_encoded.columns,
          'Coefficient': model_lr.coef_
          })
feature_importance.sort_values(by='Coefficient', ascending=False)



Unnamed: 0,Feature,Coefficient
3,State_Florida,938.793006
4,State_New York,6.98776
0,R&D Spend,0.80563
2,Marketing Spend,0.029855
1,Administration,-0.068788
