In [None]:
pip install pandas numpy scikit-learn matplotlib seaborn

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv('Dataset.csv')

# Quick sanity check: show the first five rows.
print(data.head(n=5))

In [None]:
# Structural overview: column names, dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
data.info()

# Summary statistics of the numeric columns.
print(data.describe())

# Count of missing values per column.
print(data.isnull().sum())

In [None]:
# Labels of the 64-bit float and integer columns; these are the columns that get
# mean-imputed in the following cell.
numerical_cols = data.select_dtypes(include=[np.float64, np.int64]).columns

In [None]:
# Rows without a label cannot be used for supervised learning: drop them rather than
# inventing a target value. (Previously the target was mean-imputed together with the
# features.) copy() gives an independent frame for the column assignment below.
data = data.dropna(subset=['Aggregate rating']).copy()

# Mean-impute the numeric *feature* columns only; the target is excluded.
feature_num_cols = numerical_cols.drop('Aggregate rating', errors='ignore')
# NOTE(review): the means are computed over all rows, before the train/test split done
# in a later cell, so test-set values influence the imputation. Compute them on the
# training split instead if a strict hold-out evaluation is required.
data[feature_num_cols] = data[feature_num_cols].fillna(data[feature_num_cols].mean())

In [None]:
# One-hot encode the non-numeric columns so the frame is fully numerical.
# drop_first=True omits the first level of each encoded column (k categories -> k-1
# indicator columns).
# NOTE(review): every such column is encoded; if the dataset has free-text or
# identifier-like columns this can create a very wide frame — confirm.
data = pd.get_dummies(data=data, drop_first=True)

In [None]:
# Separate the prediction target from the feature matrix.
target_col = 'Aggregate rating'
y = data[target_col]                 # target variable
X = data.drop(columns=[target_col])  # every remaining column is a feature

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Decision-tree regressor with a fixed seed; all other hyperparameters keep their
# library defaults.
model = DecisionTreeRegressor(random_state=42)

# Fit on the training split only, leaving the test split for evaluation.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X=X_test)

In [None]:
# Score the test-set predictions against the true ratings.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics, one per line.
for label, value in (("Mean Squared Error:", mse), ("R² Score:", r2)):
    print(label, value)

In [None]:
# Pair each feature column with the importance the fitted tree assigned to it.
features, feature_importances = X.columns, model.feature_importances_
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})

# Show the features ranked from most to least important.
ranked_importances = importance_df.sort_values('Importance', ascending=False)
print(ranked_importances)

In [None]:
# Persist the fitted model to disk so it can be reloaded later without retraining.
# joblib is imported together with the other dependencies in the notebook's import
# cell rather than here, so all dependencies are visible in one place.
joblib.dump(model, 'decision_tree_model.pkl')
print("Model saved!")

In [None]:
# Reload the persisted model and predict on the same held-out rows.
# NOTE: joblib files are pickle-based; only load files that come from a trusted source.
loaded_model = joblib.load('decision_tree_model.pkl')
new_predictions = loaded_model.predict(X_test)

# Verify the save/load round trip: the reloaded model must reproduce the in-memory
# model's predictions exactly. (Previously new_predictions was computed but never checked.)
assert np.array_equal(new_predictions, y_pred), \
    "Reloaded model predictions differ from the original model's predictions"

In [None]:
# Scatter of true vs. predicted ratings on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)

# Label the axes and title so the figure stands on its own.
ax.set_xlabel("Actual Ratings")
ax.set_ylabel("Predicted Ratings")
ax.set_title("Actual vs Predicted Ratings")
plt.show()