In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:

# Load the labelled history used for training and the items to be scored later.
historic_data = pd.read_csv('historic.csv')
prediction_input = pd.read_csv('prediction_input.csv')

# Encode each categorical column with its OWN LabelEncoder and keep the fitted
# encoders. Re-fitting one shared encoder on every column would overwrite its
# classes_ each time, leaving only the last column's mapping available, so the
# training-time encoding could not be re-applied to new data.
# NOTE(review): prediction_input presumably carries the same categorical
# columns and should be transformed with these encoders -- confirm its schema.
categorical_columns = ['category', 'main_promotion', 'color']
encoders = {}
for column in categorical_columns:
    column_encoder = LabelEncoder()
    historic_data[column] = column_encoder.fit_transform(historic_data[column])
    encoders[column] = column_encoder

# Backward compatibility: `encoder` previously ended up fitted on 'color'.
encoder = encoders['color']

# Features are every column except the item identifier and the target label.
X = historic_data.drop(['item_no', 'success_indicator'], axis=1)
y = historic_data['success_indicator']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# Random forest with 100 trees; the fixed seed makes training repeatable.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)



In [None]:

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = historic_data.describe()
print(summary_stats)


             item_no        stars
count    8000.000000  8000.000000
mean   546474.338375     3.473075
std    256513.463874     0.978810
min    100171.000000     0.300000
25%    325001.500000     2.800000
50%    547081.000000     3.500000
75%    764312.250000     4.200000
max    989740.000000     6.100000


In [None]:

# Train the forest on the training split.
rf_model.fit(X=X_train, y=y_train)



In [None]:
# Predict labels for the held-out split.
y_pred = rf_model.predict(X_test)

# Per-class precision, recall and F1 on the held-out split.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"Confusion Matrix:\n {cm}")


              precision    recall  f1-score   support

        flop       0.80      0.68      0.73       571
         top       0.84      0.90      0.87      1029

    accuracy                           0.82      1600
   macro avg       0.82      0.79      0.80      1600
weighted avg       0.82      0.82      0.82      1600

Confusion Matrix:
 [[388 183]
 [100 929]]


In [None]:
# Importance score of each training feature, as reported by the fitted forest.
feature_importances = rf_model.feature_importances_

# Pair every feature name with its score and rank from most to least important.
feature_importances_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
)
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

# One bar per feature, in ranked order.
importance_bars = go.Bar(
    x=feature_importances_df['Feature'],
    y=feature_importances_df['Importance'],
)
layout_options = {
    'title': 'Feature Importances',
    'xaxis_title': 'Feature',
    'yaxis_title': 'Importance',
}
fig = go.Figure(data=[importance_bars])
fig.update_layout(**layout_options)
fig.show()


In [None]:

import joblib

# Persist the fitted model to disk; dump() returns the list of files written.
model_path = 'random_forest_model.pkl'
joblib.dump(rf_model, model_path)



['random_forest_model.pkl']

In [None]:
# Histogram of star ratings (nbins=5) to show how the ratings are distributed.
# `px` is plotly.express, imported with the other libraries in the notebook's
# import cell; without that import this cell fails with NameError on a fresh kernel.
fig = px.histogram(historic_data, x='stars', nbins=5, title='Distribution of Star Ratings')
fig.show()


In [52]:
# Display the first few rows of the dataset
print(historic_data.head())

# Display the last few rows of the dataset
print(historic_data.tail())

# Show the shape of the dataframe as (rows, columns)
print(historic_data.shape)

# Display general information about the dataframe.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
historic_data.info()


   item_no  category  main_promotion  color  stars success_indicator
0   739157         5               0      3    3.1              flop
1   591846         1               1      7    1.5              flop
2   337574         3               0      7    4.4               top
3   401933         2               1      1    3.1              flop
4   812151         1               1      3    4.1               top
      item_no  category  main_promotion  color  stars success_indicator
7995   280947         4               0      1    3.9              flop
7996   874952         3               3      3    3.5               top
7997   891640         4               1      9    4.0              flop
7998   786448         0               1      1    2.6              flop
7999   847223         3               2      1    3.9               top
(8000, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 6 columns):
 #   Column             Non-Null Count

In [53]:
# Frequency of each distinct value in every categorical column.
categorical_columns = ('category', 'main_promotion', 'color')
for column_name in categorical_columns:
    counts = historic_data[column_name].value_counts()
    print(counts)



5    1650
2    1546
4    1459
3    1360
0    1246
1     739
Name: category, dtype: int64
1    2432
0    2246
3    2013
2    1309
Name: main_promotion, dtype: int64
4    1443
1    1244
9    1056
0     812
7     776
3     728
5     592
2     585
6     412
8     352
Name: color, dtype: int64


In [54]:
# Distribution of the 'stars' column drawn as a histogram (nbins=5).
fig = px.histogram(
    data_frame=historic_data,
    x='stars',
    nbins=5,
    title='Distribution of Star Ratings',
)
fig.show()


In [55]:
# Heatmap of pairwise correlations between the numeric features.
# With only 'stars' listed the matrix is 1x1; extend the list to compare more columns.
numeric_features = ['stars']
correlation_matrix = historic_data[numeric_features].corr()
correlation_matrix = correlation_matrix.round(2)
axis_labels = {'x': 'Features', 'y': 'Features'}
fig = px.imshow(
    correlation_matrix,
    labels=axis_labels,
    title='Correlation Heatmap',
)
fig.show()


In [56]:
# Box plot of the 'stars' column, used to spot outlying ratings.
fig = px.box(
    data_frame=historic_data,
    y='stars',
    title='Box Plot of Star Ratings',
)
fig.show()
