# In [15]:

# 📥 Data Collection
import pickle

import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

dataset = pd.read_csv('50_Startups.csv')  # Load dataset

# 🧹 Data Preprocessing
# Convert categorical 'State' column into dummy variables; drop_first=True
# omits the first level so the remaining dummies are not redundant.
dataset = pd.get_dummies(dataset, drop_first=True)

# 🔀 Input / Output Split
# The order of these columns is the feature order the fitted model expects
# at prediction time.
feature_columns = [
    'R&D Spend',
    'Administration',
    'Marketing Spend',
    'State_Florida',
    'State_New York',
]
X = dataset[feature_columns]
y = dataset['Profit']

# 📊 Split Train and Test
# 30% of the rows are held out; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# 🏗️ Model Creation (Decision Tree Regressor)
regressor = DecisionTreeRegressor(random_state=0)

# 🏋️ Train Model
regressor.fit(X_train, y_train)

# 🧪 Test Set Prediction
y_pred = regressor.predict(X_test)

# 📈 Evaluation Metrics
r_squared = r2_score(y_test, y_pred)
print("R-squared:", r_squared)  # Closer to 1 means better model

# 💾 Save the Best Model
filename = 'DecisionTree_Model.pkl'
# The context manager closes (and flushes) the file even if pickling raises,
# instead of leaving the handle for the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)



# 📦 Load the Saved Model
# NOTE: unpickling can execute arbitrary code — only load model files that
# come from a trusted source (here, the file written earlier in this script).
# The context manager guarantees the file handle is closed after loading.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# 🧾 Get Inputs & 🔮 Predict
# Predict on test set
predictions = loaded_model.predict(X_test)
print("Predictions:", predictions)

# Predict for a new data point
# The model was fitted on a DataFrame, so the new sample is built as a
# DataFrame with the same column names. That lets scikit-learn validate the
# input against the feature names seen during fit; a bare nested list would
# trigger an "X does not have valid feature names" warning instead.
# Values follow the column order of X_test.
new_data = pd.DataFrame(
    [[160000, 130000, 300000, 0, 1]],
    columns=X_test.columns,
)
new_prediction = loaded_model.predict(new_data)
print("Prediction for new input:", new_prediction)

# 📣 Call to Action
# These predictions can be integrated into a business dashboard or web app for decision-making.

# Output:
# R-squared: 0.9087310670610367
# Predictions: [101004.64 141585.52 141585.52  78239.91 182901.99 118474.03  71498.49
#  101004.64 118474.03 182901.99  89949.14  89949.14 126992.93  89949.14
#  125370.37]
# Prediction for new input: [192261.83]


