In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neural_network import MLPRegressor


In [55]:
data = pd.read_csv("50_Startups.csv")


In [56]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,165349.2,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [57]:
data.tail()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
45,1000.23,124153.04,1903.93,64926.08
46,1315.46,115816.21,297114.46,49490.75
47,0.0,135426.92,0.0,42559.73
48,542.05,51743.15,0.0,35673.41
49,0.0,116983.8,45173.06,14681.4


In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   Profit           50 non-null     float64
dtypes: float64(4)
memory usage: 1.7 KB


In [59]:
data.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [60]:
data.shape

(50, 4)

In [61]:

X = data[['R&D Spend',	'Administration',	'Marketing Spend']]  
y = data['Profit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()



model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

MAE: 6979.1522523704
MSE: 80926321.2229516
RMSE: 8995.905803361416
R2: 0.900065308303732


In [62]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse)
r2_rf = r2_score(y_test, y_pred_rf)

print(f'MAE: {mae_rf}')
print(f'MSE: {mse_rf}')
print(f'RMSE: {rmse_rf}')
print(f'R2: {r2_rf}')

MAE: 6437.497739999977
MSE: 72625008.62306513
RMSE: 8995.905803361416
R2: 0.9103164738430438


In [63]:
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f'MAE: {mae_xgb}')
print(f'MSE: {mse_xgb}')
print(f'RMSE: {rmse_xgb}')
print(f'R2: {r2_xgb}')

MAE: 7335.841953125002
MSE: 94578365.21962246
RMSE: 8995.905803361416
R2: 0.8832066053846593


In [64]:

svr_model = SVR(kernel='rbf')  

svr_model.fit(X_train, y_train)

y_pred_svr = svr_model.predict(X_test)

mae_svr = mean_absolute_error(y_test, y_pred_svr)
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr = np.sqrt(mse)
r2_svr = r2_score(y_test, y_pred_svr)

print(f'MAE: {mae_svr}')
print(f'MSE: {mse_svr}')
print(f'RMSE: {rmse_svr}')
print(f'R2: {r2_svr}')

MAE: 22844.10930188185
MSE: 955479565.0702635
RMSE: 8995.905803361416
R2: -0.1799072825060064


In [65]:
nn_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

nn_model.fit(X_train, y_train)

y_pred_nn = nn_model.predict(X_test)

mae_nn = mean_absolute_error(y_test, y_pred_nn)
mse_nn = mean_squared_error(y_test, y_pred_nn)
rmse_nn = np.sqrt(mse)
r2_nn = r2_score(y_test, y_pred_nn)

print(f'MAE: {mae_nn}')
print(f'MSE: {mse_nn}')
print(f'RMSE: {rmse_nn}')
print(f'R2: {r2_nn}')

MAE: 22538.078420527276
MSE: 768361286.0862529
RMSE: 8995.905803361416
R2: 0.051162253813160086


In [67]:
print("The Best Model is Random Forest.")

The Best Model is Random Forest.
