# Ecom Machine Learning Project

In [1]:
import pandas as pd
import os
from os import listdir
import re
import json
import shutil
import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics  import roc_auc_score,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from datetime import datetime
from application_logging import logger

# Class: App Logger
This class writes timestamped log messages for each step of the pipeline.

In [66]:
class App_logger:
    """Minimal logger that writes timestamped lines to a caller-supplied file object."""

    def __init__(self):
        """Nothing to set up; the timestamp is captured on every ``log`` call."""

    def log(self, file_object, log_message):
        """Write one log line to *file_object*.

        The line has the form ``<date>/<HH:MM:SS>``, two tab characters,
        the message, and a trailing newline.

        Args:
            file_object: Any object with a ``write`` method (e.g. an open file).
            log_message: Text to record; must be a ``str``.

        The timestamp parts are also kept on the instance as ``now``,
        ``date`` and ``current_time``.
        """
        stamp = datetime.now()
        self.now = stamp
        self.date = stamp.date()
        self.current_time = stamp.strftime('%H:%M:%S')
        prefix = f"{self.date}/{self.current_time}\t\t"
        file_object.write(prefix + log_message + '\n')

# Class: Get data
This class reads the validated CSV files from the Good_Raw_Files directory.

In [17]:
class data_getter:
    """Load CSV files from the project's ``Good_Raw_Files`` directory, logging each step."""

    def __init__(self):
        """Set the source directory and open the shared log file in append mode.

        The log file path is relative to the current working directory, so
        the ``Ecom_logs`` folder must already exist there.
        """
        self.path = 'C:\\Users\\rahul.goyal\\Desktop\\Data Science\\Machine Learning\\Projects\\Ecom_project\\Training_Raw_Files\\Good_Raw_Files' 
        self.log_writer = logger.App_logger()
        self.file_object = open("Ecom_logs/LogData.txt", 'a+')
        
    def read_files(self, filename):
        """Read ``<self.path>/<filename>.csv`` into a DataFrame.

        Args:
            filename: File name without the ``.csv`` extension.

        Returns:
            The loaded ``pandas.DataFrame`` (also stored on ``self.data``).

        Raises:
            Exception: If the file cannot be read. The error is logged, the
                log file is closed, and the raised exception is chained to
                the original error so its message and traceback survive.
        """
        self.log_writer.log(self.file_object, "Start reading file")
        try:
            self.all_files = os.path.join(self.path, filename + '.csv')
            self.data = pd.read_csv(self.all_files)
            self.log_writer.log(self.file_object, 'Data load successful. Exited the data_getter class.')
            return self.data
        
        except Exception as e:
            self.log_writer.log(self.file_object, 'Exception occurred in read_files method of data_getter class. Exception message:' + str(e))
            self.log_writer.log(self.file_object, 'Data load unsuccessful. Exited the data_getter class.')
            self.file_object.close()
            # Keep the generic Exception type callers already handle, but carry
            # the root cause instead of raising an empty exception.
            raise Exception('Data load unsuccessful for ' + repr(filename) + ': ' + str(e)) from e

# Class: Data Validation
This class shall be used for handling all the validation done on the Raw Training Data.

In [18]:
class Raw_data_validation:
    """Validate raw training files by name and stage the accepted ones.

    Accepted files are copied to
    ``<Batch_directory>/Ecom_project/Training_Raw_Files/Good_Raw_Files``.
    """

    def __init__(self):
        """Set the batch root directory and open the shared log file in append mode."""
        self.Batch_directory = 'C:\\Users\\rahul.goyal\\Desktop\\Data Science\\Machine Learning\\Projects/'
        self.log_writer = logger.App_logger()
        self.file_object = open("Ecom_logs/LogData.txt", 'a+')
                
    def manualRegexCreation(self):
        """Return the pattern that valid raw file names must start with."""
        regex = 'olist'
        return regex

    def _raw_dir(self, leaf):
        """Return the path of the ``Training_Raw_Files/<leaf>`` staging directory.

        One helper is used for both creating and filling the directories so
        the spelling (and letter case) of the path cannot drift apart.
        """
        return os.path.join(self.Batch_directory, 'Ecom_project', 'Training_Raw_Files', leaf)
    
    def createdirectoryforGoodBadData(self):
        """Create the ``Good_Raw_Files`` and ``Bad_Raw_Files`` directories if missing.

        A failure creating the good directory is only logged (the original
        best-effort behavior); a failure creating the bad directory is logged
        and raised.

        Raises:
            Exception: If the bad-files directory cannot be created (chained
                to the original error).
        """
        self.log_writer.log(self.file_object, 'Good/Bad file directory making started.')
        try:
            os.makedirs(self._raw_dir('Good_Raw_Files'), exist_ok=True)
            self.log_writer.log(self.file_object, 'Good_Raw_Files dir made.')
            
        except Exception as e:
            self.log_writer.log(self.file_object, 'Exception occured in making good_raw_file directory. Exception message:' + str(e))
        
        try:
            os.makedirs(self._raw_dir('Bad_Raw_Files'), exist_ok=True)
            self.log_writer.log(self.file_object, 'Bad_Raw_Files dir made.')
            
        except Exception as e:
            self.log_writer.log(self.file_object, 'Exception occured in making bad_raw_file directory. Exception message:' + str(e))
            # The handle is ``file_object``; the former ``log_object`` did not exist.
            self.file_object.close()
            raise Exception('Could not create Bad_Raw_Files directory: ' + str(e)) from e
        
    def validationFileName(self, regex, source_directory=None):
        """Copy every file whose name matches *regex* into ``Good_Raw_Files``.

        Args:
            regex: Pattern applied with ``re.match`` (anchored at the start
                of the file name).
            source_directory: Directory to scan. Defaults to the project's
                ``dataset_files`` folder.

        Returns:
            None.

        Raises:
            Exception: If listing or copying fails. The error is logged, the
                log file is closed, and the exception is chained to the cause.
        """
        self.createdirectoryforGoodBadData()
        self.log_writer.log(self.file_object, 'File Validation started')
        if source_directory is None:
            source_directory = 'C:\\Users\\rahul.goyal\\Desktop\\Data Science\\Machine Learning\\Projects\\dataset_files/'
        self.directory = source_directory
        try:
            pattern = re.compile(regex)
            good_dir = self._raw_dir('Good_Raw_Files')
            for filename in listdir(self.directory):
                source = os.path.join(self.directory, filename)
                # Sub-directories cannot be copied with shutil.copy; skip them.
                if pattern.match(filename) and os.path.isfile(source):
                    shutil.copy(source, os.path.join(good_dir, filename))
            self.log_writer.log(self.file_object, 'File Validation by name completed and file transfered to Good_Raw_Files directory. Exited the class Raw Data Validation')
            
        except Exception as e:
            self.log_writer.log(self.file_object, 'File Validation error occured. Exception message:'+ str(e))
            self.file_object.close()
            raise Exception('File validation failed: ' + str(e)) from e

# Class:Preprocessing
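This class merges the datasets, removes unwanted columns, checks for null values, separates the label from the features, balances the imbalanced dataset, encodes categorical data, selects the best features and splits the data into train and test sets.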

In [21]:
class preprocessing:
    """Merge, clean, encode, balance, select and split the training data.

    Every step logs to ``Ecom_logs/LogData.txt``. When a step fails, the
    error is logged, the log file is closed, and an ``Exception`` chained to
    the original error is raised. Because the log file is closed on failure,
    an instance should not be reused after one of its methods has raised.
    """

    # Default project root; pass ``project_dir`` to the constructor to run
    # the same steps against another location.
    DEFAULT_PROJECT_DIR = 'C:\\Users\\rahul.goyal\\Desktop\\Data Science\\Machine Learning\\Projects\\Ecom_project'

    def __init__(self, project_dir=None):
        """Set up paths and open the shared log file in append mode.

        Args:
            project_dir: Project root directory. Defaults to
                ``DEFAULT_PROJECT_DIR``.
        """
        self.dir_path = self.DEFAULT_PROJECT_DIR if project_dir is None else project_dir
        # Directory the raw CSV files are read from (and merge_data.csv is written to).
        self.path = os.path.join(self.dir_path, 'Training_Raw_Files', 'Good_Raw_Files')
        self.log_writer = logger.App_logger()
        self.file_object = open("Ecom_logs/LogData.txt", 'a+')

    def _raise_logged(self, message, error):
        """Log *message* plus the error text, close the log file and raise a chained Exception."""
        self.log_writer.log(self.file_object, message + str(error))
        self.file_object.close()
        raise Exception(message + str(error)) from error

    def createMregeDataDirectory(self):
        """Create ``<project>/Merge_data`` if it does not exist.

        Raises:
            Exception: If the directory cannot be created.
        """
        try:
            os.makedirs(os.path.join(self.dir_path, 'Merge_data'), exist_ok=True)
            self.log_writer.log(self.file_object, 'Creating Merge Data Directory.')

        except Exception as e:
            self._raise_logged('Error occured in making merge data directory. Exception message:', e)

    def merge_files(self, file1, file2):
        """Merge two CSV files from ``self.path`` on their first shared column.

        The result is written to ``<self.path>/merge_data.csv`` and copied
        into ``<project>/Merge_data``, so a later call can pass
        ``'merge_data'`` as *file1* to chain merges.

        Args:
            file1: Name of the left file, without ``.csv``.
            file2: Name of the right file, without ``.csv``.

        Returns:
            The merged ``pandas.DataFrame``.

        Raises:
            Exception: If reading, merging or writing fails, or if the two
                files have no column name in common.
        """
        # Kept outside the try block: this step logs and closes the log file
        # itself on failure, so it must not be handled a second time here.
        self.createMregeDataDirectory()
        try:
            self.filename1 = os.path.join(self.path, file1 + '.csv')
            self.file1 = pd.read_csv(self.filename1)
            self.filename2 = os.path.join(self.path, file2 + '.csv')
            self.file2 = pd.read_csv(self.filename2)
            self.common_col = self.file1.columns.intersection(self.file2.columns)
            if len(self.common_col) == 0:
                raise ValueError('no common column between ' + repr(file1) + ' and ' + repr(file2))
            # Only the first shared column is used as the join key.
            self.merge_df = self.file1.merge(self.file2, on=self.common_col[0])
            merged_path = os.path.join(self.path, 'merge_data.csv')
            self.merge_df.to_csv(merged_path, index=False)
            shutil.copy(merged_path, os.path.join(self.dir_path, 'Merge_data'))
            self.log_writer.log(self.file_object, 'Data Merged successfully.')
            return self.merge_df

        except Exception as e:
            self._raise_logged('Error occured while merging the data. Exception message:', e)

    def remove_columns(self, final_df, columns):
        """Return a copy of *final_df* without the given columns.

        Args:
            final_df: Input DataFrame.
            columns: Column labels to drop.

        Raises:
            Exception: If any label is missing or the drop fails.
        """
        try:
            self.data = final_df
            self.columns = columns
            self.df_useful_cols = self.data.drop(labels=self.columns, axis=1)
            self.log_writer.log(self.file_object, 'Columns removed. Exited the Preprocessing class')
            return self.df_useful_cols

        except Exception as e:
            self._raise_logged('Error occured while removing the columns. Exception message:', e)

    def is_null_present(self, final_df):
        """Summarize missing values per column.

        Args:
            final_df: DataFrame to inspect.

        Returns:
            A DataFrame with one row per column of *final_df* and the columns
            ``'columns'`` and ``'missing values count'``. It is returned even
            when there are no missing values (all counts are then 0).

        Side effects:
            When at least one value is missing, the summary is also written
            to ``null_values.csv`` in the current working directory.

        Raises:
            Exception: If the check fails.
        """
        try:
            null_counts = final_df.isnull().sum()
            dataframe_with_null = pd.DataFrame({
                'columns': final_df.columns,
                'missing values count': np.asarray(null_counts),
            })
            if (null_counts > 0).any():
                dataframe_with_null.to_csv('null_values.csv', index=False)
            self.log_writer.log(self.file_object, 'Checking Missing/Null values present.')
            return dataframe_with_null

        except Exception as e:
            self._raise_logged('Error while checking null values. Exception message:', e)

    def seperate_label_features(self, data, label_column_name):
        """Split *data* into features and label.

        Args:
            data: Input DataFrame.
            label_column_name: Name of the target column.

        Returns:
            ``(X, Y)`` where ``X`` is *data* without the label column and
            ``Y`` is the label column.

        Raises:
            Exception: If the label column is missing.
        """
        try:
            self.data = data
            self.X = self.data.drop(labels=label_column_name, axis=1)
            self.Y = self.data[label_column_name]
            self.log_writer.log(self.file_object, 'Seperated feature columns and Target feature.')
            return self.X, self.Y

        except Exception as e:
            self._raise_logged('Error while seperating target and feature columns. Exception message:', e)

    def handle_imb_dataset(self, x, y, random_state=None):
        """Balance the classes by random over-sampling.

        Args:
            x: Feature matrix.
            y: Labels.
            random_state: Optional seed forwarded to ``RandomOverSampler``
                for reproducible resampling.

        Returns:
            ``(x_sampled, y_sampled)``.

        Raises:
            Exception: If resampling fails.
        """
        try:
            self.rdsample = RandomOverSampler(random_state=random_state)
            self.x_sampled, self.y_sampled = self.rdsample.fit_resample(x, y)
            self.log_writer.log(self.file_object, 'Imbalanced dataset converted into balanced dataset.')
            return self.x_sampled, self.y_sampled

        except Exception as e:
            self._raise_logged('Error in making balanced dataset. Exception message:', e)

    def createFinalData(self):
        """Create ``<project>/Final_Data`` if it does not exist."""
        os.makedirs(os.path.join(self.dir_path, 'Final_Data'), exist_ok=True)

    def encode_cat_cols(self, data):
        """One-hot encode the object-dtype columns of *data*.

        The first level of each categorical column is dropped
        (``drop_first=True``). The encoded table is also saved to
        ``<project>/Final_Data/preprocessed_file.csv``.

        Args:
            data: Input DataFrame.

        Returns:
            The encoded DataFrame.

        Raises:
            Exception: If encoding or saving fails.
        """
        try:
            self.createFinalData()
            self.data = data
            self.cat_features = [features for features in self.data.columns if self.data[features].dtypes == "O"]
            self.final_file = pd.get_dummies(self.data, columns=self.cat_features, drop_first=True)
            # A local is used on purpose: ``self.path`` is the raw-file
            # directory read by merge_files and must not be repointed here.
            final_dir = os.path.join(self.dir_path, 'Final_Data')
            self.final_file.to_csv(os.path.join(final_dir, 'preprocessed_file.csv'), index=False)
            self.log_writer.log(self.file_object, 'Encoded the categorical column into numeric values.')
            return self.final_file

        except Exception as e:
            self._raise_logged('Error in encoding categorical features. Exception message', e)

    def feature_selection(self, xr, yr, k=10):
        """Keep the *k* best features according to the chi-squared test.

        Args:
            xr: Feature matrix.
            yr: Labels.
            k: Number of features to keep (default 10).

        Returns:
            The reduced feature matrix produced by
            ``SelectKBest.fit_transform``.

        Raises:
            Exception: If selection fails.
        """
        try:
            self.selectkbest = SelectKBest(chi2, k=k)
            self.x_new = self.selectkbest.fit_transform(xr, yr)
            self.log_writer.log(self.file_object, f'Top {k} features selected using SelectKbest.')
            return self.x_new

        except Exception as e:
            self._raise_logged('Error occured while selecting the best features. Exception message:', e)

    def train_test_split(self, x_new, yr, random_state=None):
        """Split the data 80:20 into train and test sets.

        Args:
            x_new: Feature matrix.
            yr: Labels.
            random_state: Optional seed for a reproducible split.

        Returns:
            ``(x_train, x_test, y_train, y_test)``.

        Raises:
            Exception: If the split fails.
        """
        try:
            # Inside a method body this name resolves to the module-level
            # sklearn function, not to this method.
            self.split = train_test_split(x_new, yr, test_size=0.2, random_state=random_state)
            self.x_train, self.x_test, self.y_train, self.y_test = self.split
            self.log_writer.log(self.file_object, 'Splited the data into train test in 80:20.')
            return self.x_train, self.x_test, self.y_train, self.y_test

        except Exception as e:
            self._raise_logged('Error in splitting the data. Exception message:', e)
        

# Class: handling missing values 
This class imputes missing values: the column mean for numerical features and the placeholder 'other' for categorical features.

In [22]:
class handling_missing_values:
    """Impute missing values and persist the result to ``Clean_Data/clean.csv``."""

    # Default project root; pass ``project_dir`` to the constructor to use
    # another location.
    DEFAULT_PROJECT_DIR = 'C:\\Users\\rahul.goyal\\Desktop\\Data Science\\Machine Learning\\Projects\\Ecom_project'

    def __init__(self, project_dir=None):
        """Set up paths and open the shared log file in append mode.

        Args:
            project_dir: Project root directory. Defaults to
                ``DEFAULT_PROJECT_DIR``.
        """
        self.dir_path = self.DEFAULT_PROJECT_DIR if project_dir is None else project_dir
        self.path = os.path.join(self.dir_path, 'Training_Raw_Files', 'Good_Raw_Files')
        # Filled in by impute_missing_data from its argument; the constructor
        # no longer depends on a module-level variable named ``data``.
        self.data = None
        self.log_writer = logger.App_logger()
        self.file_object = open("Ecom_logs/LogData.txt", 'a+')

    def _clean_csv(self, name):
        """Return the path of ``<project>/Clean_Data/<name>.csv``."""
        return os.path.join(self.dir_path, 'Clean_Data', name + '.csv')

    def _raise_logged(self, message, error):
        """Log *message* plus the error text, close the log file and raise a chained Exception."""
        self.log_writer.log(self.file_object, message + str(error))
        self.file_object.close()
        raise Exception(message + str(error)) from error

    def createCleanDataFolder(self):
        """Create ``<project>/Clean_Data`` if it does not exist."""
        os.makedirs(os.path.join(self.dir_path, 'Clean_Data'), exist_ok=True)

    def impute_missing_data(self, data, feature_name):
        """Fill missing values of a numerical column with the column mean.

        The column is updated in place on *data*, and the whole table is
        written to ``Clean_Data/clean.csv``.

        Args:
            data: DataFrame to impute.
            feature_name: Name of the numerical column to fill.

        Returns:
            The imputed DataFrame (the same object as *data*).

        Raises:
            Exception: If imputing or saving fails.
        """
        try:
            self.createCleanDataFolder()
            self.data = data
            self.feature_name = feature_name
            self.mean_feature = data[feature_name].mean()
            data[feature_name] = data[feature_name].fillna(self.mean_feature)
            data.to_csv(self._clean_csv('clean'), index=False)
            self.log_writer.log(self.file_object, 'Imputed the missing values in the merged dataset and saving it into CSV file.')
            return data

        except Exception as e:
            self._raise_logged('Error in imputing the missing values. Exception message:', e)

    def impute_missing_values(self, file, feature_name):
        """Fill a numerical column of a saved clean file with the column mean.

        Reads ``Clean_Data/<file>.csv`` and writes the result to
        ``Clean_Data/clean.csv``.

        Args:
            file: File name inside ``Clean_Data``, without ``.csv``.
            feature_name: Name of the numerical column to fill.

        Returns:
            The imputed DataFrame.

        Raises:
            The original error (e.g. a missing file or column) is logged and
            re-raised unchanged, as this method never wrapped its errors.
        """
        try:
            self.filename = self._clean_csv(file)
            frame = pd.read_csv(self.filename)
            self.feature_name = feature_name
            self.mean_value = frame[feature_name].mean()
            frame[feature_name] = frame[feature_name].fillna(self.mean_value)
            frame.to_csv(self._clean_csv('clean'), index=False)
            self.file = frame
            self.log_writer.log(self.file_object, 'Imputed the numerical missing values and saving the data into Clean CSV file.')
            return frame

        except Exception as e:
            self.log_writer.log(self.file_object, 'Error in imputing the numerical missing values. Exception message:' + str(e))
            raise

    def impute_cat_missing_data(self, file, cat_feature):
        """Fill a categorical column of a saved clean file with ``'other'``.

        Reads ``Clean_Data/<file>.csv`` and writes the result to
        ``Clean_Data/clean.csv``.

        Args:
            file: File name inside ``Clean_Data``, without ``.csv``.
            cat_feature: Name of the categorical column to fill.

        Returns:
            The imputed DataFrame.

        Raises:
            Exception: If reading, imputing or saving fails.
        """
        try:
            self.filename = self._clean_csv(file)
            frame = pd.read_csv(self.filename)
            self.cat_feature = cat_feature
            frame[cat_feature] = frame[cat_feature].fillna('other')
            frame.to_csv(self._clean_csv('clean'), index=False)
            self.file = frame
            self.log_writer.log(self.file_object, 'Imputed the categorical missing values and saving the data into Clean CSV file.')
            return frame

        except Exception as e:
            self._raise_logged('Error in imputing the categorical missing values. Exception message:', e)

# Class: Model finder
This class trains the candidate models and finds the best one.

In [23]:
class model_finder:
    """Train a decision tree and a random forest and pick the better one.

    When a step fails, the error is logged, the log file is closed, and an
    ``Exception`` chained to the original error is raised.
    """

    def __init__(self):
        """Create default estimators and open the shared log file in append mode."""
        self.dt = DecisionTreeClassifier()
        self.rf = RandomForestClassifier()
        self.log_writer = logger.App_logger()
        self.file_object = open("Ecom_logs/LogData.txt", 'a+')

    def _raise_logged(self, message, error):
        """Log *message* plus the error text, close the log file and raise a chained Exception."""
        self.log_writer.log(self.file_object, message + str(error))
        self.file_object.close()
        raise Exception(message + str(error)) from error

    def dt_model(self, x_train, y_train):
        """Fit a fresh ``DecisionTreeClassifier`` and return it.

        Raises:
            Exception: If training fails.
        """
        try:
            self.dt = DecisionTreeClassifier()
            self.dt.fit(x_train, y_train)
            self.log_writer.log(self.file_object, 'Training the model with Decision Tree Classifier.')
            return self.dt

        except Exception as e:
            self._raise_logged('Error in training the DT model. Exception message:', e)

    def rf_model(self, x_train, y_train):
        """Fit a fresh ``RandomForestClassifier`` and return it.

        Raises:
            Exception: If training fails.
        """
        try:
            self.rf = RandomForestClassifier()
            self.rf.fit(x_train, y_train)
            self.log_writer.log(self.file_object, 'Training the model with Random Forest Classifier.')
            return self.rf

        except Exception as e:
            self._raise_logged('Error in training the RF model. Exception message:', e)

    def get_best_model(self, x_train, x_test, y_train, y_test):
        """Train both models and report the one with the higher test score.

        Each model is compared on the value returned by its ``score`` method
        for ``(x_test, y_test)``. On a tie the random forest is reported.

        Returns:
            ``(model_name, score)`` where ``model_name`` is
            ``'Decision_Tree_model'`` or ``'Random_Forest_model'`` and
            ``score`` is the winning score rounded to 3 decimals, times 100.

        Raises:
            Exception: If training or scoring fails.
        """
        # Training stays outside the try block: dt_model/rf_model already log
        # and close the log file on failure, so their errors must not be
        # handled a second time against a closed file.
        self.decision_tree = self.dt_model(x_train, y_train)
        self.random_forest = self.rf_model(x_train, y_train)
        try:
            self.dt_score = self.decision_tree.score(x_test, y_test)
            self.rf_score = self.random_forest.score(x_test, y_test)
            if self.dt_score > self.rf_score:
                best = ('Decision_Tree_model', self.dt_score.round(decimals=3)*100)
            else:
                best = ('Random_Forest_model', self.rf_score.round(decimals=3)*100)
            # Logged for either winner (previously only when the forest won).
            self.log_writer.log(self.file_object, 'Selected the best model.')
            return best

        except Exception as e:
            self._raise_logged('Error in selecting the best model. Exception message:', e)

# Saving the model
## Class File operation
 Description: This class shall be used to save the model after training
                and load the saved model for prediction.

In [24]:
class file_operation:
    """Save trained models to disk and load them back for prediction.

    Models are stored as ``<model_directory>/Models/<name>/<name>.pkl``.
    """

    def __init__(self, model_directory=None):
        """Set the project directory and open the shared log file in append mode.

        Args:
            model_directory: Directory that will contain the ``Models``
                folder. Defaults to the project root.
        """
        if model_directory is None:
            model_directory = 'C:\\Users\\rahul.goyal\\Desktop\\Data Science\\Machine Learning\\Projects\\Ecom_project'
        self.model_directory = model_directory
        self.log_writer = logger.App_logger()
        self.file_object = open("Ecom_logs/LogData.txt", 'a+')

    def _models_dir(self):
        """Return the ``Models`` folder, derived from ``self.model_directory``."""
        return os.path.join(self.model_directory, 'Models')

    def createmodelfolder(self):
        """Create the ``Models`` folder if it does not exist."""
        os.makedirs(self._models_dir(), exist_ok=True)

    def save_model(self, model, filename):
        """Pickle *model* to ``Models/<filename>/<filename>.pkl``.

        Args:
            model: Any picklable object (e.g. a fitted estimator).
            filename: Model name, used for both the folder and the file.

        Returns:
            None.

        Raises:
            Exception: If the folder cannot be created or the model cannot
                be written. The error is logged, the log file is closed, and
                the exception is chained to the cause.
        """
        try:
            self.createmodelfolder()
            self.dir = self._models_dir()
            path = os.path.join(self.dir, filename)
            os.makedirs(path, exist_ok=True)

            with open(os.path.join(path, filename + '.pkl'), 'wb') as f:
                pickle.dump(model, f)
            self.log_writer.log(self.file_object, 'Model saved.')

        except Exception as e:
            self.log_writer.log(self.file_object, 'Error in saving the models. Exception message:' + str(e))
            self.file_object.close()
            raise Exception('Error in saving the models: ' + str(e)) from e

    def load_model(self, filename):
        """Load the model stored under *filename* by ``save_model``.

        Args:
            filename: Model name used when saving.

        Returns:
            The unpickled object.

        Raises:
            OSError: If the pickle file does not exist or cannot be opened.
        """
        self.dir = self._models_dir()
        # SECURITY: unpickling runs arbitrary code from the file. Only load
        # model files written by this project, never untrusted input.
        with open(os.path.join(self.dir, filename, filename + '.pkl'), 'rb') as f:
            return pickle.load(f)

### Evaluation of dataset

In [19]:
### File validation: copy every source file whose name starts with 'olist' into Good_Raw_Files.
### validationFileName has no return statement, so file_dir is always None; the call matters for its side effect.
file_dir = Raw_data_validation().validationFileName(regex='olist')

In [20]:
### Reading the CSV files: load the reviews table from Good_Raw_Files and preview its first row
reviews = data_getter().read_files('olist_order_reviews_dataset') 
reviews.head(1)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59


In [25]:
# Merge review and order data on the first column name the two files share.
# merge_files also saves the result as merge_data.csv next to the raw files,
# which is what the following cells read back under the name 'merge_data'.

review_order_df = preprocessing().merge_files('olist_order_reviews_dataset', 'olist_orders_dataset')
review_order_df.shape

(99224, 14)

In [26]:
# Merging review_order_df (read back from merge_data.csv) with the payment dataset;
# the result overwrites merge_data.csv.
review_order_payment = preprocessing().merge_files('merge_data', 'olist_order_payments_dataset')
review_order_payment.shape

(103677, 18)

In [27]:
# Merging the running merge_data.csv (reviews + orders + payments) with the customers dataset
review_order_payment_cust = preprocessing().merge_files('merge_data', 'olist_customers_dataset')
review_order_payment_cust.shape

(103677, 22)

In [28]:
### Merging the running merge_data.csv (reviews + orders + payments + customers) with the order items dataset
review_order_payment_cust_items = preprocessing().merge_files('merge_data', 'olist_order_items_dataset')
review_order_payment_cust_items.shape

(117329, 28)

In [29]:
# Merging the running merge_data.csv (now including order items) with the products dataset
review_order_payment_cust_items_products = preprocessing().merge_files('merge_data', 'olist_products_dataset')
review_order_payment_cust_items_products.shape

(117329, 36)

In [30]:
# Merge the running merge_data.csv (now including products) with the sellers dataset;
# this is the last merge, so merge_data.csv now holds the full joined table.
review_order_payment_cust_items_products_sellers = preprocessing().merge_files('merge_data', 'olist_sellers_dataset')
review_order_payment_cust_items_products_sellers.shape

(117329, 39)

In [32]:
### Reading the fully merged table back from merge_data.csv and previewing the first rows
final_df = data_getter().read_files('merge_data') 
final_df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,customer_id,order_status,order_purchase_timestamp,...,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,seller_zip_code_prefix,seller_city,seller_state
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59,41dcb106f807e993532d446263290104,delivered,2018-01-11 15:30:49,...,42.0,858.0,1.0,1300.0,30.0,30.0,35.0,14600,sao joaquim da barra,SP
1,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59,41dcb106f807e993532d446263290104,delivered,2018-01-11 15:30:49,...,42.0,858.0,1.0,1300.0,30.0,30.0,35.0,14600,sao joaquim da barra,SP
2,e2655da8bccce2ddc2f43cc2c80d24f3,19721d59a9764437477eff0730b48665,5,,,2018-04-10 00:00:00,2018-04-11 11:30:43,b1757da2a461d581e1a3b280ca181b7c,delivered,2018-03-26 11:02:31,...,42.0,858.0,1.0,1300.0,30.0,30.0,35.0,14600,sao joaquim da barra,SP
3,cb18f53b563f0bdeb76be0376ddd6b6a,efd626e6a12a82d76e456e34093f8356,5,,,2017-12-28 00:00:00,2017-12-30 12:07:50,da2bd16b131bc90653b87fe4d8fd266e,delivered,2017-12-07 20:33:24,...,42.0,858.0,1.0,1300.0,30.0,30.0,35.0,14600,sao joaquim da barra,SP
4,5fdb2ed7aa0f1b1c9642175b302e38af,b0bc656fed47584c160f8cff2d7b8bbd,5,,,2018-03-17 00:00:00,2018-03-17 17:36:27,f4ad1c9eb0bed4469cbe0ff8f47bf634,delivered,2018-02-27 13:56:12,...,47.0,858.0,1.0,1600.0,30.0,30.0,35.0,14600,sao joaquim da barra,SP


In [33]:
### Checking the missing values: one row per column with its count of nulls.
### When nulls exist, is_null_present also writes the table to null_values.csv in the working directory.

missing_values = preprocessing().is_null_present(final_df)
missing_values

Unnamed: 0,columns,missing values count
0,review_id,0
1,order_id,0
2,review_score,0
3,review_comment_title,103437
4,review_comment_message,67650
5,review_creation_date,0
6,review_answer_timestamp,0
7,customer_id,0
8,order_status,0
9,order_purchase_timestamp,0


In [34]:
### Removing unwanted columns: identifiers, free-text review fields, date/timestamp columns
### and the product weight/dimension columns are dropped before modelling.
data = preprocessing().remove_columns(final_df, columns = ['review_id', 'order_id', 'review_comment_title', 'review_comment_message', 
                                                          'customer_unique_id','customer_id', 'order_approved_at', 'order_delivered_carrier_date',
                                                          'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
                                                          'review_creation_date', 'review_answer_timestamp', 'order_purchase_timestamp',
                                                          'order_delivered_customer_date', 'order_estimated_delivery_date', 'product_weight_g', 
                                                           'product_length_cm', 'product_height_cm', 'product_width_cm'])

In [35]:
# Row/column count after dropping the unwanted columns.
data.shape

(117329, 18)

In [36]:
# Preview the remaining columns.
data.head()

Unnamed: 0,review_score,order_status,payment_sequential,payment_type,payment_installments,payment_value,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,seller_zip_code_prefix,seller_city,seller_state
0,4,delivered,1,credit_card,8,397.26,6030,osasco,SP,185.0,13.63,esporte_lazer,42.0,858.0,1.0,14600,sao joaquim da barra,SP
1,4,delivered,1,credit_card,8,397.26,6030,osasco,SP,185.0,13.63,esporte_lazer,42.0,858.0,1.0,14600,sao joaquim da barra,SP
2,5,delivered,1,credit_card,3,205.26,95860,taquari,RS,185.0,20.26,esporte_lazer,42.0,858.0,1.0,14600,sao joaquim da barra,SP
3,5,delivered,1,credit_card,2,202.05,36500,uba,MG,185.0,17.05,esporte_lazer,42.0,858.0,1.0,14600,sao joaquim da barra,SP
4,5,delivered,1,credit_card,1,306.4,38412,uberlandia,MG,135.0,18.2,esporte_lazer,47.0,858.0,1.0,14600,sao joaquim da barra,SP


In [37]:
# Re-run the null check on the reduced table to see which remaining columns still need imputing.
missing_values = preprocessing().is_null_present(data)
missing_values

Unnamed: 0,columns,missing values count
0,review_score,0
1,order_status,0
2,payment_sequential,0
3,payment_type,0
4,payment_installments,0
5,payment_value,0
6,customer_zip_code_prefix,0
7,customer_city,0
8,customer_state,0
9,price,0


In [38]:
### Imputing missing numerical data: fill product_name_lenght with its column mean.
### The imputed table is written to Clean_Data/clean.csv, which the next cells read back as 'clean'.
product_name_lenght = handling_missing_values().impute_missing_data(data, 'product_name_lenght')

In [39]:
# Read Clean_Data/clean.csv, fill product_description_lenght with its column mean, and write clean.csv back.
product_description_lenght = handling_missing_values().impute_missing_values('clean', 'product_description_lenght')

In [40]:
# Same step for product_photos_qty: mean-impute the column in clean.csv and save it again.
product_photos_qty = handling_missing_values().impute_missing_values('clean', 'product_photos_qty')

In [41]:
### Imputing missing categorical data: missing product_category_name values become 'other',
### and the result is saved to Clean_Data/clean.csv.
clean_data = handling_missing_values().impute_cat_missing_data('clean', 'product_category_name')

In [42]:
### Reading the clean dataset back from disk and confirming that no column has missing values left.
### Note: final_df is rebound here from the merged table to the cleaned table.
final_df = pd.read_csv('C:\\Users\\rahul.goyal\\Desktop\\Data Science\\Machine Learning\\Projects\\Ecom_project\\Clean_Data/clean.csv')
final_df.isnull().sum()

review_score                  0
order_status                  0
payment_sequential            0
payment_type                  0
payment_installments          0
payment_value                 0
customer_zip_code_prefix      0
customer_city                 0
customer_state                0
price                         0
freight_value                 0
product_category_name         0
product_name_lenght           0
product_description_lenght    0
product_photos_qty            0
seller_zip_code_prefix        0
seller_city                   0
seller_state                  0
dtype: int64

In [43]:
# Row/column count of the cleaned table.
final_df.shape

(117329, 18)

In [44]:
# Column names of the cleaned table.
final_df.columns

Index(['review_score', 'order_status', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value', 'customer_zip_code_prefix',
       'customer_city', 'customer_state', 'price', 'freight_value',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty',
       'seller_zip_code_prefix', 'seller_city', 'seller_state'],
      dtype='object')

In [45]:
# List the object-dtype (categorical) columns.
# NOTE(review): this inspects `data` (the frame from the earlier remove_columns cell),
# not the re-read `final_df` — confirm that is intended.
cat_features = [features for features in data.columns if data[features].dtypes=="O"]
cat_features

['order_status',
 'payment_type',
 'customer_city',
 'customer_state',
 'product_category_name',
 'seller_city',
 'seller_state']

In [46]:
### Removing more unwanted columns from the cleaned table: city names, product text/photo
### counts and the seller zip prefix. `data` is rebound to the reduced frame.
data = preprocessing().remove_columns(final_df, columns = ['customer_city','product_name_lenght', 'product_description_lenght',
                                                          'product_photos_qty', 'seller_zip_code_prefix', 'seller_city'])

In [47]:
# Preview the reduced table.
data.head()

Unnamed: 0,review_score,order_status,payment_sequential,payment_type,payment_installments,payment_value,customer_zip_code_prefix,customer_state,price,freight_value,product_category_name,seller_state
0,4,delivered,1,credit_card,8,397.26,6030,SP,185.0,13.63,esporte_lazer,SP
1,4,delivered,1,credit_card,8,397.26,6030,SP,185.0,13.63,esporte_lazer,SP
2,5,delivered,1,credit_card,3,205.26,95860,RS,185.0,20.26,esporte_lazer,SP
3,5,delivered,1,credit_card,2,202.05,36500,MG,185.0,17.05,esporte_lazer,SP
4,5,delivered,1,credit_card,1,306.4,38412,MG,135.0,18.2,esporte_lazer,SP


In [48]:
# Row/column count after the second column removal.
data.shape

(117329, 12)

In [49]:
### Encoding categorical features: one-hot encode the object-dtype columns (first level dropped).
### The encoded table is also saved as Final_Data/preprocessed_file.csv.
data = preprocessing().encode_cat_cols(data)

In [50]:
# Preview the first two rows of the encoded table.
data.head(2)

Unnamed: 0,review_score,payment_sequential,payment_installments,payment_value,customer_zip_code_prefix,price,freight_value,order_status_canceled,order_status_delivered,order_status_invoiced,...,seller_state_PE,seller_state_PI,seller_state_PR,seller_state_RJ,seller_state_RN,seller_state_RO,seller_state_RS,seller_state_SC,seller_state_SE,seller_state_SP
0,4,1,8,397.26,6030,185.0,13.63,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,4,1,8,397.26,6030,185.0,13.63,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [51]:
### Separate label and features: y is the review_score column, x is every other column
x, y = preprocessing().seperate_label_features(data, 'review_score')

In [52]:
# Class distribution of the review_score label before resampling.
y.value_counts()

5    66264
4    22286
1    14854
3     9840
2     4085
Name: review_score, dtype: int64

In [53]:
# Row/column count of the feature matrix.
x.shape

(117329, 136)

In [54]:
# Preview the feature matrix.
x.head()

Unnamed: 0,payment_sequential,payment_installments,payment_value,customer_zip_code_prefix,price,freight_value,order_status_canceled,order_status_delivered,order_status_invoiced,order_status_processing,...,seller_state_PE,seller_state_PI,seller_state_PR,seller_state_RJ,seller_state_RN,seller_state_RO,seller_state_RS,seller_state_SC,seller_state_SE,seller_state_SP
0,1,8,397.26,6030,185.0,13.63,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,8,397.26,6030,185.0,13.63,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,3,205.26,95860,185.0,20.26,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,2,202.05,36500,185.0,17.05,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,1,306.4,38412,135.0,18.2,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


Dataset is imbalanced.

In [55]:
### Making the balanced dataset by randomly over-sampling the smaller classes
xr, yr = preprocessing().handle_imb_dataset(x, y)

In [56]:
# Class distribution after over-sampling.
yr.value_counts()

4    66264
5    66264
1    66264
3    66264
2    66264
Name: review_score, dtype: int64

In [57]:
# Preview the resampled feature matrix.
xr.head()

Unnamed: 0,payment_sequential,payment_installments,payment_value,customer_zip_code_prefix,price,freight_value,order_status_canceled,order_status_delivered,order_status_invoiced,order_status_processing,...,seller_state_PE,seller_state_PI,seller_state_PR,seller_state_RJ,seller_state_RN,seller_state_RO,seller_state_RS,seller_state_SC,seller_state_SE,seller_state_SP
0,1,8,397.26,6030,185.0,13.63,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,8,397.26,6030,185.0,13.63,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,3,205.26,95860,185.0,20.26,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,2,202.05,36500,185.0,17.05,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,1,306.4,38412,135.0,18.2,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [58]:
### Selecting the 10 best features from the resampled dataset with SelectKBest and the chi-squared score
x_new = preprocessing().feature_selection(xr, yr)

In [59]:
### Train test split: 80% train / 20% test on the selected features
x_train, x_test, y_train, y_test = preprocessing().train_test_split(x_new, yr)

In [60]:
# Shape of the training features.
x_train.shape

(265056, 10)

In [61]:
# Shape of the training labels.
y_train.shape

(265056,)

In [62]:
### Creating the Decision Tree model and checking its score on the test split (as a percentage)
dt = model_finder().dt_model(x_train, y_train)
dt.score(x_test, y_test).round(decimals=3)*100

90.5

In [63]:
### Creating the Random Forest model and checking its score on the test split (as a percentage)
rf = model_finder().rf_model(x_train, y_train)
rf.score(x_test, y_test).round(decimals=3)*100

92.9

In [64]:
### Selecting the best model: get_best_model trains both models again on the same split
### and returns the name and score of the higher-scoring one.
best_model = model_finder().get_best_model(x_train, x_test, y_train, y_test)
best_model

('Random_Forest_model', 93.0)

In [65]:
### Saving the decision tree model as Models/decision_tree/decision_tree.pkl.
### save_model has no return statement, so sav_model_dt is None.
sav_model_dt = file_operation().save_model(dt, 'decision_tree')

In [66]:
### Saving the Random Forest model as Models/random_forest/random_forest.pkl.
### save_model has no return statement, so sav_model_rf is None.
sav_model_rf = file_operation().save_model(rf, 'random_forest')

In [56]:
### Loading the saved Decision Tree model and scoring it on the current test split
load_dt = file_operation().load_model('decision_tree')
load_dt.score(x_test,y_test).round(decimals=3)*100

90.4

In [57]:
### Loading the saved Random Forest model and scoring it on the current test split
load_rf = file_operation().load_model('random_forest')
load_rf.score(x_test, y_test).round(decimals=3)*100

93.0

In [58]:
### Building one hand-written sample (10 values, one per selected feature) to predict on.
### The values match row 2 of the selected-feature preview shown at the end of the notebook.
### Note: this rebinds the global name `data` from the DataFrame to a plain list.
data = [[3, 205.26, 95860, 185, 20.26, 0, 0, 0, 0, 0]]
new_df = pd.DataFrame(data)

In [59]:
# Display the single-row sample.
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,3,205.26,95860,185,20.26,0,0,0,0,0


In [60]:
# Predict the review score for the sample with the decision tree trained above.
check = dt.predict(new_df)

In [61]:
# Show the predicted class.
print(check)

[5]


# End

### Checking which features selected

In [45]:
# Feature selection of top 10 features: fit the selector only (no transform) so the
# chosen columns can be inspected afterwards.
x_new_check = SelectKBest(chi2, k=10).fit(xr, yr)

In [46]:
# Boolean mask over the columns of xr: True marks a selected feature.
boolean_value = x_new_check.get_support()
boolean_value

array([False,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [47]:
# Keep only the selected columns, with their names, for inspection.
df = xr[xr.columns[boolean_value]]

In [48]:
# Preview the selected features by name.
df.head()

Unnamed: 0,payment_installments,payment_value,customer_zip_code_prefix,price,freight_value,order_status_canceled,order_status_invoiced,order_status_processing,order_status_shipped,customer_state_RJ
0,8,397.26,6030,185.0,13.63,0,0,0,0,0
1,8,397.26,6030,185.0,13.63,0,0,0,0,0
2,3,205.26,95860,185.0,20.26,0,0,0,0,0
3,2,202.05,36500,185.0,17.05,0,0,0,0,0
4,1,306.4,38412,135.0,18.2,0,0,0,0,0


Now, the dataset is balanced