In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('Profit.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [5]:
df.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
# Create dummy variables for the 'State' column, dropping the first category to avoid multicollinearity
states = pd.get_dummies(df['State'], drop_first=True)

# Ensure dummy variables are in 0 and 1 format
states = states.astype(int)

# Drop the original 'State' column from the dataframe
df = df.drop('State', axis=1)

# Concatenate the dummy variables to the original dataframe
df = pd.concat([df, states], axis=1)

# Display the updated dataframe
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [8]:
X = df[['R&D Spend','Administration','Marketing Spend','Florida','New York']]
X

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0
5,131876.9,99814.71,362861.36,0,1
6,134615.46,147198.87,127716.82,0,0
7,130298.13,145530.06,323876.68,1,0
8,120542.52,148718.95,311613.29,0,1
9,123334.88,108679.17,304981.62,0,0


In [9]:
y= df[['Profit']]
y

Unnamed: 0,Profit
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94
5,156991.12
6,156122.51
7,155752.6
8,152211.77
9,149759.96


In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [11]:
linear_model = LinearRegression()
linear_model.fit(X_train,y_train)

In [12]:
y_pred = linear_model.predict(X_test)
y_pred

array([[ 74061.28471141],
       [ 46009.23798767],
       [ 99637.26360771],
       [155786.53229368],
       [127636.76349541],
       [192765.18597815],
       [ 63906.99972432],
       [ 54935.14415867],
       [ 84532.35238003],
       [109460.29711995]])

In [13]:
r2_score(y_test,y_pred)

0.9783259006628308

In [14]:
# Manual input for prediction
# For example, R&D Spend: 160000, Administration: 130000, Marketing Spend: 300000, State: Florida
manual_input = pd.DataFrame({
    'R&D Spend': [160000],
    'Administration': [130000],
    'Marketing Spend': [300000],
    'Florida': [1],  # 1 if state is Florida, else 0
    'New York': [0]  # 1 if state is New York, else 0
})

# Predict the output for the manual input
manual_pred = linear_model.predict(manual_input)
print("Predicted Profit for manual input:", manual_pred[0])

Predicted Profit for manual input: [183399.49990366]


In [15]:
# Manual input for prediction
manual_input_array = np.array([360000, 230000, 250000, 0, 1]).reshape(1, -1)

# Convert the numpy array to a DataFrame with the correct column names
manual_input = pd.DataFrame(manual_input_array, columns=['R&D Spend', 'Administration', 'Marketing Spend', 'Florida', 'New York'])

# Predict the output for the manual input
manual_pred = linear_model.predict(manual_input)
print("Predicted Profit for manual input:", manual_pred[0])

Predicted Profit for manual input: [340738.52230514]


In [16]:
# Displaying coefficients
print("Coefficients(M): ", linear_model.coef_)
print("Intercept (C): ", linear_model.intercept_)

Coefficients(M):  [[ 8.13230566e-01 -4.65775420e-02  2.69909892e-02 -1.07669797e+03
  -3.76485056e+02]]
Intercept (C):  [52317.09103701]


### Using StandardScaler Approach

In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

df = pd.read_csv('Profit.csv')

# Create dummy variables for the 'State' column, dropping the first category to avoid multicollinearity
states = pd.get_dummies(df['State'], drop_first=True)

# Ensure dummy variables are in 0 and 1 format
states = states.astype(int)

# Drop the original 'State' column from the dataframe
df = df.drop('State', axis=1)

# Concatenate the dummy variables to the original dataframe
df = pd.concat([df, states], axis=1)

# Define the feature set and target variable
X = df[['R&D Spend', 'Administration', 'Marketing Spend', 'Florida', 'New York']]
y = df['Profit']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict the output for the test set
y_pred = linear_model.predict(X_test)

print(y_pred)
r2_score(y_test,y_pred)

[ 74061.28471133  46009.2379876   99637.26360759 155786.53229373
 127636.76349538 192765.18597814  63906.99972423  54935.14415853
  84532.35238007 109460.29711993]


0.9783259006626639

In [18]:
# Manual input for prediction
manual_input = pd.DataFrame({
    'R&D Spend': [160000],
    'Administration': [130000],
    'Marketing Spend': [300000],
    'Florida': [1],  # 1 if state is Florida, else 0
    'New York': [0]  # 1 if state is New York, else 0
})

# Apply scaling to the manual input
manual_input_scaled = scaler.transform(manual_input)

# Predict the output for the scaled manual input
manual_pred = linear_model.predict(manual_input_scaled)
print("Predicted Profit for manual input:", manual_pred[0])

Predicted Profit for manual input: 183399.49990372785
