In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, cross_val_score


## Loading the data

In [None]:
# Read the Wish summer-products CSV into a DataFrame and preview the first rows.
data_path = '../input/summer-products-and-sales-in-ecommerce-wish/summer-products-with-rating-and-performance_2020-08.csv'
df = pd.read_csv(data_path)
df.head()

## Some Basic Data Exploration

In [None]:
# Transposed preview: each column of the frame becomes a row, which is easier
# to scan when the frame is wide.
df.head().T

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the frame's columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

## Dropping Some Columns
Dropping the non-numeric columns with many unique categories (more than `max_unique`), because they won't be very helpful and would just add many dimensions to the data.

In [None]:
# Threshold above which a non-numeric column is considered too high-cardinality to keep.
max_unique = 60

# Count distinct values per non-numeric column, then collect the names of the
# columns whose count exceeds the threshold.
unique_counts = df.select_dtypes(exclude=np.number).nunique()
high_unique = unique_counts[unique_counts > max_unique].index.tolist()

df = df.drop(columns=high_unique)
df.info()

In [None]:
# Transposed preview of the columns that remain after the drop.
df.head().T

In [None]:
# Frequency of each distinct value in the `currency_buyer` column.
df['currency_buyer'].value_counts()

## Building correlation

In [None]:
# Pairwise correlation matrix of the numeric (and boolean) columns.
# The frame still contains text columns at this point, and pandas >= 2.0 no
# longer drops them silently in DataFrame.corr() -- it raises instead -- so the
# numeric columns are selected explicitly. This works on older pandas as well.
corr = df.select_dtypes(include=['number', 'bool']).corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicitly created
# 16x16 inch figure.
fig, ax = plt.subplots(figsize=(16, 16))
ax = sns.heatmap(corr, annot=True, fmt=".2f", linewidths=1.2, cmap="YlGnBu", ax=ax);

## Visualizing some columns

In [None]:
# Number of products per origin country. The column is passed as `x=` because
# recent seaborn releases interpret the first positional argument as `data`
# rather than as the variable to count (older releases emit a FutureWarning).
sns.countplot(x=df['origin_country']);

In [None]:
# Number of products per urgency text. The column is passed as `x=` because
# recent seaborn releases interpret the first positional argument as `data`
# rather than as the variable to count (older releases emit a FutureWarning).
sns.countplot(x=df['urgency_text']);

In [None]:
# Bar plot of `units_sold` grouped by `origin_country`.
sns.barplot(data=df, x='origin_country', y='units_sold');

In [None]:
# Frequency of each distinct non-missing value in `has_urgency_banner`
# (value_counts skips NaN by default).
df['has_urgency_banner'].value_counts()

Dropping some more columns

In [None]:
# Further columns removed from the frame before modelling.
unused_columns = ['crawl_month', 'origin_country', 'rating_count',
                  'shipping_option_name', 'urgency_text']
df = df.drop(columns=unused_columns)

In [None]:
# Transposed preview of the frame after the second round of column drops.
df.head().T

In [None]:
# Re-check dtypes and non-null counts of the remaining columns.
df.info()

# Label-encoding the 'currency_buyer' & 'theme' columns

In [None]:
# Replace the values of each listed column with integer codes. The same
# encoder object is re-fitted for every column, so after the loop `le` only
# holds the classes of the last column processed.
le = LabelEncoder()
for col in ('currency_buyer', 'theme'):
    df[col] = le.fit_transform(df[col])

Checking which numeric columns have missing values

In [None]:
# Print the name of every numeric column that contains at least one missing value.
for name in df.columns:
    column = df[name]
    if pd.api.types.is_numeric_dtype(column) and column.isna().any():
        print(name)

In [None]:
# Distinct non-missing values of `has_urgency_banner` and how often each occurs
# (missing entries are not counted by value_counts by default).
df['has_urgency_banner'].value_counts()

Filling the 'has_urgency_banner' column with 0

In [None]:
# Replace missing `has_urgency_banner` entries with 0.
urgency_banner = df['has_urgency_banner']
df['has_urgency_banner'] = urgency_banner.fillna(0)

Filling all the other numeric columns with their median

In [None]:
# Impute every numeric column that still has missing values with that column's median.
# NOTE(review): the medians are computed on the full frame, i.e. before the
# train/test split performed later in the notebook.
for name in df.columns:
    column = df[name]
    if pd.api.types.is_numeric_dtype(column) and column.isna().any():
        df[name] = column.fillna(column.median())

In [None]:
# Dtypes and non-null counts after filling the missing values.
df.info()

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

In [None]:
# Remaining missing values per column after the fills above.
df.isna().sum()

# Splitting the data into Training and Testing sets

In [None]:
# Target: the `units_sold` column. Features: every other column left in the frame.
y = df['units_sold']
x = df.drop(columns='units_sold')

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split,
# and therefore every score computed from it, is the same on each run; without
# it the results change on every execution even though the forest is seeded.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Building the models
# RandomForest

In [None]:
# Fit a 1000-tree random forest regressor on the training split (seeded for
# reproducibility) and display the fitted estimator.
model = RandomForestRegressor(n_estimators=1000, random_state=42).fit(x_train, y_train)
model

In [None]:
# Score of the random forest on the held-out test split.
model.score(x_test,y_test)

# LinearRegression

In [None]:
# Fit an ordinary linear regression on the same training split and display the
# fitted estimator.
model1 = LinearRegression().fit(x_train, y_train)
model1

In [None]:
# Score of the linear regression on the held-out test split.
model1.score(x_test,y_test)

In [None]:
# One-row frame holding the test-split score of each fitted model, one column
# per model.
fitted_models = {'RandomForest': model, 'LinearRegression': model1}
scores = pd.DataFrame(
    [{name: estimator.score(x_test, y_test) for name, estimator in fitted_models.items()}],
    index=[0],
)


# Plotting the scores of both models

In [None]:
# Bar chart with one bar per model. The frame is transposed so the model names
# become the x-axis categories; labels are set on the returned Axes.
ax = scores.T.plot.bar(figsize=(10, 10))
ax.set_title('Scores of all Model')
ax.set_xlabel('Model Name')
ax.set_ylabel('Scores');