## Summary: Try to predict the revenue generated by a new product based on multiple parameters (units_sold, rating, retail_price, ...)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file beneath the Kaggle input
# directory and print each one as it is discovered.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
data = pd.read_csv('/kaggle/input/summer-products-and-sales-in-ecommerce-wish/summer-products-with-rating-and-performance_2020-08.csv')

In [None]:
data.head()

In [None]:
data.columns

In [None]:
num_data = data.select_dtypes(include=['float64','int64'])

In [None]:
num_data.head()

In [None]:
rev = num_data['price'] * num_data['units_sold']

In [None]:
num_data['rev'] = rev

In [None]:
num_data.describe()

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Draw one 50-bin histogram per column of num_data (figsize 20 x 15).
num_data.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

# Columns to plot against each other in the pairwise scatter grid.
attributes = ["rev", "units_sold", "rating", "rating_count"]
selected = num_data[attributes]
scatter_matrix(selected, figsize=(12, 8))

In [None]:
# Treat this as a classification problem because there is not a lot of data:
# revenue is cut into equal-frequency buckets and the bucket index is the
# label. The bucket intervals themselves are shown in this cell's output.
import numpy as np

REVENUE_BUCKETS = 5

# Features: every numeric column except units_sold and the derived revenue.
X = num_data.drop(columns=['units_sold', 'rev'])
# Labels: integer index of the quantile bucket each row's revenue falls in.
y = pd.qcut(num_data['rev'], q=REVENUE_BUCKETS, labels=False)
np.unique(pd.qcut(num_data['rev'], q=REVENUE_BUCKETS).values)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X = num_pipeline.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

In [None]:
voting_clf.fit(X_train, y_train)


In [None]:

from sklearn.base import clone
from sklearn.metrics import accuracy_score

# Number of leading training rows used for fitting in this comparison; the
# remaining training rows serve as a validation hold-out.
N_FIT_ROWS = 1150
if len(X_train) <= N_FIT_ROWS:
    raise ValueError(
        f"Need more than {N_FIT_ROWS} training rows to keep a validation "
        f"hold-out, got {len(X_train)}"
    )

X_fit, X_val = X_train[:N_FIT_ROWS], X_train[N_FIT_ROWS:]
y_fit, y_val = y_train[:N_FIT_ROWS], y_train[N_FIT_ROWS:]

# Print the validation accuracy of each base classifier and of the ensemble.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # Evaluate an unfitted clone (same hyper-parameters and seed) so the
    # shared estimators are not refitted on the subset; in particular the
    # voting_clf fitted on the full training split stays intact.
    candidate = clone(clf)
    candidate.fit(X_fit, y_fit)
    y_pred = candidate.predict(X_val)
    print(clf.__class__.__name__, accuracy_score(y_val, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

Random forest has the best results, so I will use randomized search CV to find better hyperparameters.

In [None]:
# Search space for the random forest. max_features keeps its original
# 10..22 range but is clamped to the number of feature columns actually
# present, so the search still works if the feature set shrinks.
n_features = X_train.shape[1]
params = {
    'max_leaf_nodes': list(range(80, 200)),
    'n_estimators': list(range(20, 200)),
    'max_features': randint(low=min(10, n_features), high=min(23, n_features + 1)),
}

# Sample 10 candidates and score each with 3-fold cross-validated accuracy.
# random_state seeds the candidate sampling so repeated runs try the same
# candidates, matching the seeded estimators used elsewhere in this file.
rnd_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), params,
                                verbose=1, cv=3, n_iter=10, scoring='accuracy',
                                random_state=42)

rnd_search.fit(X_train, y_train)

In [None]:

# Print the mean cross-validation score of every sampled candidate next to
# the hyper-parameters that produced it. The loop variable is deliberately
# not called `params`, so the search-space dict of that name is not
# overwritten.
cvres = rnd_search.cv_results_
for mean_score, candidate_params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, candidate_params)

In [None]:
model = rnd_search.best_estimator_

In [None]:
 for i,j in zip(num_data.drop(['units_sold','rev'],axis=1).columns.values,model.feature_importances_):
        print(i,j)

In [None]:
pred = model.predict(X_test)

In [None]:
accuracy_score(pred,y_test)

In [None]:
pred.shape

In [None]:
len(model.predict(X_test))

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test.values, pred)