In [1]:
# Feature Engineering for 50_Startups Dataset

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np


In [2]:
# Load the dataset
df = pd.read_csv("50_Startups.csv")


In [3]:
# Display the first few rows
df.head()


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Step 1: Create Interaction Terms
df['Interaction_R&D_Marketing'] = df['R&D Spend'] * df['Marketing Spend']

In [5]:
# Step 2: Calculate Profit Ratios
df['Profit_R&D_Ratio'] = df['Profit'] / df['R&D Spend']
df['Profit_Marketing_Ratio'] = df['Profit'] / df['Marketing Spend']

In [6]:
# Step 3: Apply Log Transformation to Profit
df['Log_Profit'] = np.log(df['Profit'])


In [9]:
# Step 4: One-Hot Encode the State Variable
# Replace 'sparse' with 'sparse_output'
ohe = OneHotEncoder(sparse_output=False)
state_encoded = ohe.fit_transform(df[['State']])
state_columns = ohe.get_feature_names_out(['State'])
state_df = pd.DataFrame(state_encoded, columns=state_columns)

In [10]:
# Drop the original State column
df.drop('State', axis=1, inplace=True)


In [11]:
# Step 5: Feature Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['R&D Spend', 'Administration', 'Marketing Spend', 'Interaction_R&D_Marketing']])
scaled_df = pd.DataFrame(scaled_features, columns=['R&D Spend_Scaled', 'Administration_Scaled', 'Marketing Spend_Scaled', 'Interaction_R&D_Marketing_Scaled'])


In [12]:
# Merge the scaled features back to the original dataframe
df = pd.concat([df, scaled_df], axis=1)

In [13]:
# Final Dataset after Feature Engineering
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Interaction_R&D_Marketing,Profit_R&D_Ratio,Profit_Marketing_Ratio,Log_Profit,R&D Spend_Scaled,Administration_Scaled,Marketing Spend_Scaled,Interaction_R&D_Marketing_Scaled
0,165349.2,136897.8,471784.1,192261.83,78009120000.0,1.162762,0.407521,12.166613,2.016411,0.560753,2.153943,3.019595
1,162597.7,151377.59,443898.53,191792.06,72176880000.0,1.17955,0.432063,12.164167,1.95586,1.082807,1.9236,2.718386
2,153441.51,101145.55,407934.54,191050.39,62594090000.0,1.245102,0.468336,12.160292,1.754364,-0.728257,1.626528,2.223479
3,144372.41,118671.85,383199.62,182901.99,55323450000.0,1.266876,0.477302,12.116706,1.554784,-0.096365,1.42221,1.847984
4,142107.34,91391.77,366168.42,166187.94,52035220000.0,1.169454,0.453857,12.020875,1.504937,-1.079919,1.281528,1.678162


In [14]:
# Summary of New Features Created
print("New Features Created:")
print("1. Interaction_R&D_Marketing")
print("2. Profit_R&D_Ratio")
print("3. Profit_Marketing_Ratio")
print("4. Log_Profit")
print("5. One-Hot Encoded State Columns")
print("6. Scaled Features")

New Features Created:
1. Interaction_R&D_Marketing
2. Profit_R&D_Ratio
3. Profit_Marketing_Ratio
4. Log_Profit
5. One-Hot Encoded State Columns
6. Scaled Features
