## Project 2: Startup Profit Prediction

### Importing the Dependencies

In [63]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

### Data Collection and Processing

In [29]:
# Read the startup records from the CSV file into a pandas DataFrame,
# then print the frame so the raw columns can be inspected.
startup_dataset = pd.read_csv(filepath_or_buffer="startups.csv")
print(startup_dataset)

    R&D Spend  Administration  Marketing Spend       State     Profit
0   165349.20       136897.80        471784.10    New York  192261.83
1   162597.70       151377.59        443898.53  California  191792.06
2   153441.51       101145.55        407934.54     Florida  191050.39
3   144372.41       118671.85        383199.62    New York  182901.99
4   142107.34        91391.77        366168.42     Florida  166187.94
5   131876.90        99814.71        362861.36  California  156991.12
6   134615.46       147198.87        127716.82    New York  156122.51
7   130298.13       145530.06        323876.68     Florida  155752.60
8   120542.52       148718.95        311613.29  California  152211.77
9   123334.88       108679.17        304981.62     Florida  149759.96
10  101913.08       110594.11        229160.95    New York  146121.95
11  100671.96        91790.61        249744.55  California  144259.40
12   93863.75       127320.38        249839.44     Florida  141585.52
13   91992.39       

In [5]:
# Peek at the top of the frame (first five records)
startup_dataset.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Peek at the bottom of the frame (last five records)
startup_dataset.tail(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
14,119943.24,156547.42,256512.92,California,132602.65
15,114523.61,122616.84,261776.23,Florida,129917.04
16,78013.11,121597.55,264346.06,New York,126992.93
17,94657.16,145077.58,282574.31,California,125370.37
18,91749.16,114175.79,294919.57,Florida,124266.9


In [6]:
# checking the number of rows and columns
# (`.shape` is a (row_count, column_count) tuple)
startup_dataset.shape

(19, 5)

In [7]:
# getting some information about the dataframe:
# index range, per-column non-null counts and dtypes, and memory usage
startup_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        19 non-null     float64
 1   Administration   19 non-null     float64
 2   Marketing Spend  19 non-null     float64
 3   State            19 non-null     object 
 4   Profit           19 non-null     float64
dtypes: float64(4), object(1)
memory usage: 892.0+ bytes


In [8]:
# Count the missing entries in each column
startup_dataset.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

### Encoding the Categorical Data

In [13]:
# List the distinct values present in the State column
print(startup_dataset.loc[:, "State"].unique())

['New York' 1 2]


In [65]:
# Opt in to pandas' future `replace` behaviour (no silent dtype downcasting).
# This avoids the related FutureWarning, but it also means the replaced
# column is left as `object` dtype, so it is cast to integers explicitly below.
pd.set_option('future.no_silent_downcasting', True)

# Label-encode the state names. NOTE: these integer codes impose an arbitrary
# order (New York < California < Florida) on a nominal feature, which a linear
# model will treat as a numeric quantity; one-hot encoding would avoid that.
startup_dataset["State"] = (
    startup_dataset["State"]
    .replace({
        "New York": 0,
        "California": 1,
        "Florida": 2
    })
    # Fail loudly if any unmapped state label is left over, and give the
    # model a genuinely numeric column instead of an object one.
    .astype("int64")
)
print(startup_dataset)

    R&D Spend  Administration  Marketing Spend  State     Profit
0   165349.20       136897.80        471784.10      0  192261.83
1   162597.70       151377.59        443898.53      1  191792.06
2   153441.51       101145.55        407934.54      2  191050.39
3   144372.41       118671.85        383199.62      0  182901.99
4   142107.34        91391.77        366168.42      2  166187.94
5   131876.90        99814.71        362861.36      1  156991.12
6   134615.46       147198.87        127716.82      0  156122.51
7   130298.13       145530.06        323876.68      2  155752.60
8   120542.52       148718.95        311613.29      1  152211.77
9   123334.88       108679.17        304981.62      2  149759.96
10  101913.08       110594.11        229160.95      0  146121.95
11  100671.96        91790.61        249744.55      1  144259.40
12   93863.75       127320.38        249839.44      2  141585.52
13   91992.39       135495.07        283034.71      0  134307.35
14  119943.24       15654

In [62]:
# Re-check the distinct values held in the State column
print(pd.unique(startup_dataset["State"]))

[0 1 2]


### Separating the Features and the Target

In [42]:
# Feature matrix: every column except the target
x = startup_dataset.drop(columns="Profit")
# Target vector: the Profit column
y = startup_dataset.loc[:, "Profit"]

In [25]:
# Show the feature columns (everything except Profit)
print(x)

    R&D Spend  Administration  Marketing Spend  State
0   165349.20       136897.80        471784.10      0
1   162597.70       151377.59        443898.53      1
2   153441.51       101145.55        407934.54      2
3   144372.41       118671.85        383199.62      0
4   142107.34        91391.77        366168.42      2
5   131876.90        99814.71        362861.36      1
6   134615.46       147198.87        127716.82      0
7   130298.13       145530.06        323876.68      2
8   120542.52       148718.95        311613.29      1
9   123334.88       108679.17        304981.62      2
10  101913.08       110594.11        229160.95      0
11  100671.96        91790.61        249744.55      1
12   93863.75       127320.38        249839.44      2
13   91992.39       135495.07        283034.71      0
14  119943.24       156547.42        256512.92      1
15  114523.61       122616.84        261776.23      2
16   78013.11       121597.55        264346.06      0
17   94657.16       145077.5

In [26]:
# Show the target values (the Profit column)
print(y)

0     192261.83
1     191792.06
2     191050.39
3     182901.99
4     166187.94
5     156991.12
6     156122.51
7     155752.60
8     152211.77
9     149759.96
10    146121.95
11    144259.40
12    141585.52
13    134307.35
14    132602.65
15    129917.04
16    126992.93
17    125370.37
18    124266.90
Name: Profit, dtype: float64


In [33]:
# Report the dimensions of the feature matrix and the target vector, one per line
print(x.shape, y.shape, sep="\n")

(19, 4)
(19,)


### Splitting into Training and Test Data

In [47]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=2,
)


### Model Training

In [48]:
# Create a linear regression estimator and fit it on the training split
linear_regression_model = LinearRegression()
linear_regression_model.fit(X=x_train, y=y_train)


### Model Evaluation

In [55]:
# Run the fitted model over the same rows it was trained on
train_data_prediction = linear_regression_model.predict(X=x_train)
print(train_data_prediction)

[143789.31595268 196374.62408843 142581.4503854  169496.97697259
 180061.28631018 185828.26921887 126110.95767789 152946.57402254
 129005.72171317 124774.38143278 183183.57394588 142375.92103853
 153761.9158715  129580.66293014 133170.79129671 142766.57080856
 148699.38633414]


In [52]:
# Run the fitted model over the held-out test rows
test_data_prediction = linear_regression_model.predict(X=x_test)
print(test_data_prediction)

[154273.20727363 174671.15247161]


### Performance Metrics

In [57]:
# Error metrics on the training data.
# `metrics` (from sklearn) and `np` are already imported in the first cell, so
# no imports are repeated here. The results get train-specific names so they
# are not overwritten when the test-data metrics are computed.
train_mae = metrics.mean_absolute_error(y_train, train_data_prediction)
train_mse = metrics.mean_squared_error(y_train, train_data_prediction)
train_rmse = np.sqrt(train_mse)  # square root puts the error back in Profit units
print("Mean Absolute Error (MAE):", train_mae)
print("Mean Squared Error (MSE):", train_mse)
print("Root Mean Squared Error (RMSE):", train_rmse)

Mean Absolute Error (MAE): 5835.7602900669835
Mean Squared Error (MSE): 52828234.161336906
Root Mean Squared Error (RMSE): 7268.303389466961


In [59]:
# Error metrics on the held-out test data.
# `metrics` (from sklearn) and `np` are already imported in the first cell, so
# no imports are repeated here. The results get test-specific names so they
# stay distinguishable from the training-data metrics.
test_mae = metrics.mean_absolute_error(y_test, test_data_prediction)
test_mse = metrics.mean_squared_error(y_test, test_data_prediction)
test_rmse = np.sqrt(test_mse)  # square root puts the error back in Profit units
print("Mean Absolute Error (MAE):", test_mae)
print("Mean Squared Error (MSE):", test_mse)
print("Root Mean Squared Error (RMSE):", test_rmse)

Mean Absolute Error (MAE): 6498.229872620737
Mean Squared Error (MSE): 46167147.395709276
Root Mean Squared Error (RMSE): 6794.641079240998


### Example: Making a Prediction on New Data

In [60]:
# Describe one hypothetical startup using the same feature columns, in the
# same order, as the training features. Every value is wrapped in a
# one-element list so that each key consistently yields a single-row column.
input_data = {
    "R&D Spend": [120000],
    "Administration": [100000],
    "Marketing Spend": [150000],
    "State": [1],  # encoded state: 1 = California (see the encoding cell)
}
input_df = pd.DataFrame(input_data)
predicted_profit = linear_regression_model.predict(input_df)
# predict() returns one value per input row; there is only one row here
print("Predicted Profit :- ", predicted_profit[0])
    

Predicted Profit :-  149587.54197607783
