# Prediction of the profits of the varies startup companies based on some of their features

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt

## Importing the data

In [None]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


## Extract features and the independent variable from the data.

In [None]:
x = data.drop('Profit', axis=1).values
y = data.iloc[:, -1].values

## Plotting the whole data frame

In [60]:
import plotly.express as px
core_data_fig = px.scatter(data, x="Marketing Spend", symbol='State', y='R&D Spend', color="Administration", size='Profit')
core_data_fig.update_layout(coloraxis_colorbar=dict(
        thickness=30,
        len=.85,
        x=1.05,
        y=.35
    ))
core_data_fig.show()

## Turn categorical data into numeric form

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

## Spliting data into training and test set.

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
x_test.shape

(10, 6)

## Training a model with the data

In [None]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

In [None]:

y_pred_train = linear_model.predict(x_train)
y_pred_test = linear_model.predict(x_test)
y_pred_test

array([103015.20159795, 132582.27760816, 132447.73845175,  71976.09851258,
       178537.48221057, 116161.24230167,  67851.69209676,  98791.73374687,
       113969.43533014, 167921.06569552])

## Model evaluation

In [None]:
np.printoptions(precision=2)
y_pred = linear_model.predict(x_test)
print(np.concatenate((y_pred.reshape(len(y_pred_test), 1), y_test.reshape(len(y_test), 1)), axis=-1))

[[103015.20159795 103282.38      ]
 [132582.27760816 144259.4       ]
 [132447.73845175 146121.95      ]
 [ 71976.09851258  77798.83      ]
 [178537.48221057 191050.39      ]
 [116161.24230167 105008.31      ]
 [ 67851.69209676  81229.06      ]
 [ 98791.73374687  97483.56      ]
 [113969.43533014 110352.25      ]
 [167921.06569552 166187.94      ]]


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

print(r2_score(y_test, y_pred_test))
print(mean_absolute_error(y_test, y_pred_test))

0.9347068473282546
7514.29365964318


In [62]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter3d(y=x_train[:,4], z=y_pred_train, x=x_train[:, 3], mode='markers', marker=dict(
        size=8,
        color=x_train[:, 5],
        colorscale='Viridis',
        colorbar=dict(title='Feature 0',
        thickness=10,
        len=0.4,
        x=1,
        y=.55),
        opacity=0.8
    )))
fig.add_trace(go.Scatter3d(y=x_train[:,4], z=y_train, x=x_train[:, 3], mode='markers', marker=dict(
        size=8,
        color=x_train[:, 5],
        colorscale='Magma',
        colorbar=dict(title='Feature 1',
        thickness=10,
        len=0.4,
        x=1.1,
        y=.55),
        opacity=0.8
    )))
fig.add_trace(go.Scatter3d(y=x_test[:,4], z=y_pred_test, x=x_test[:, 3], mode='markers', marker=dict(
        size=8,
        color=x_test[:, 5],
        colorscale='Oranges',
        colorbar=dict(title='Feature 2',
        thickness=10,
        len=0.4,
        x=1.1,
        y=0.1),
        opacity=0.8
    )))
fig.add_trace(go.Scatter3d(y=x_test[:,4], z=y_test, x=x_test[:, 3], mode='markers', marker=dict(
        size=8,
        color=x_test[:, 5],
        colorscale='Blues',
        # colorbar=dict(title='Feature 4'),
        opacity=0.8,
        colorbar=dict(title='Feature 3',
        thickness=10,
        len=0.4,
        x=1,
        y=0.1
)
    )))
fig.show()

In [None]:
# Making prediction on a data with R&D Spend = 160000, Administration Spend = 130000, Marketing Spend = 300000 and State = California
linear_model.predict([[0, 0, 1, 160000, 130000, 300000]])

array([182266.29294638])

In [None]:
# get the constant of the model
print(linear_model.coef_)
print(linear_model.intercept_)

[ 8.66383692e+01 -8.72645791e+02  7.86007422e+02  7.73467193e-01
  3.28845975e-02  3.66100259e-02]
42467.52924853278
