In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
# Location of the dataset (Colab upload directory); change here if the file moves.
DATA_PATH = '/content/50_Startups.csv'
data = pd.read_csv(DATA_PATH)

# Fail early with a readable message if the file lacks a column the modelling
# cells select by name, instead of a KeyError deep inside the pipeline.
REQUIRED_COLUMNS = ['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit']
missing_columns = [col for col in REQUIRED_COLUMNS if col not in data.columns]
if missing_columns:
    raise ValueError(f"{DATA_PATH} is missing expected column(s): {missing_columns}")

In [3]:
# Column groups routed through the ColumnTransformer below.
categorical_features = ['State']  # treated as a categorical column and one-hot encoded
numerical_features = ['R&D Spend', 'Marketing Spend','Administration']  # passed through unchanged

# Dense one-hot encoding of the categorical column. drop='first' omits the
# indicator of the first category to avoid the dummy variable trap.
state_encoder = OneHotEncoder(drop='first', sparse_output=False)

# 'num' columns come first in the transformed output, followed by the 'cat'
# indicator columns, because that is the order the transformers are listed in.
column_transformers = [
    ('num', 'passthrough', numerical_features),
    ('cat', state_encoder, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [4]:
# Chain the column preprocessing with an ordinary least-squares regression so
# that a single fit/predict call runs both stages in order.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

In [5]:
# Predictor columns and regression target for this model.
feature_columns = ['R&D Spend', 'Marketing Spend', 'State','Administration']
X = data[feature_columns]
y = data['Profit']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Fit the preprocessing and the regression on the training rows only.
model.fit(X_train, y_train)

In [6]:
# Predict Profit for the held-out rows; the pipeline runs its preprocessing
# step on X_test before handing the result to the regression step.
y_pred = model.predict(X_test)



In [9]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the predictions against the held-out targets.
mse3 = mean_squared_error(y_test, y_pred)
r2_3 = r2_score(y_test, y_pred)

# Display results
print("Model 3 (R&D Spend, Marketing Spend,Administration , and State):")
print(f"Mean Squared Error: {mse3}")
print(f"R-squared: {r2_3}")


# Names of the columns fed to the regressor: the passthrough numeric columns
# first, then the one-hot indicator columns produced by the fitted encoder
# (same order as the transformers are listed in the ColumnTransformer).
fitted_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
feature_names = numerical_features + list(fitted_encoder.get_feature_names_out(categorical_features))

# Display regression equation
coefficients = model.named_steps['regressor'].coef_
intercept = model.named_steps['regressor'].intercept_

# Every coefficient must have exactly one label; a mismatch would silently
# mislabel (or drop) terms in the printed equation.
if len(feature_names) != len(coefficients):
    raise ValueError(
        f"Got {len(coefficients)} coefficients but {len(feature_names)} feature names"
    )

equation = f"Profit = {intercept:.2f}"
for name, coef in zip(feature_names, coefficients):
    # Fold the sign into the operator so negative terms read "- 959.28 * x"
    # rather than "+ -959.28 * x".
    sign = '-' if coef < 0 else '+'
    equation += f" {sign} {abs(coef):.2f} * {name}"

print(f"Regression Equation: {equation}")

Model 3 (R&D Spend, Marketing Spend,Administration , and State):
Mean Squared Error: 83502864.03257743
R-squared: 0.9347068473282425
Regression Equation: Profit = 42554.17 + 0.77 * R&D Spend + 0.04 * Marketing Spend + 0.03 * Administration + -959.28 * State_Florida + 699.37 * State_New York


In [11]:
import pandas as pd

# Put the held-out targets next to the model's predictions so the first few
# rows can be compared by eye.
comparison_columns = {'Actual': y_test, 'Predicted': y_pred}
predictions = pd.DataFrame(comparison_columns)
print(predictions.head())

       Actual      Predicted
28  103282.38  103015.201598
11  144259.40  132582.277608
10  146121.95  132447.738452
41   77798.83   71976.098513
2   191050.39  178537.482211
