# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer

import seaborn as sns
import os
import sys

# Import Datasets

In [None]:
# Load the order report export and preview its first rows.
order_reports_csv = 'datasets/OrderReports.csv'
df_or = pd.read_csv(order_reports_csv)
print(df_or.head())

In [None]:
# Load the product list and preview its first rows.
product_list_csv = 'datasets/ProductList.csv'
df_pl = pd.read_csv(product_list_csv)
print(df_pl.head())

In [None]:
# Load the revenue report and preview its first rows.
revenue_report_csv = 'datasets/RevenueReport.csv'
df_rr = pd.read_csv(revenue_report_csv)
print(df_rr.head())

In [None]:
# Load the website bestseller list and preview its first rows.
website_bestsellers_csv = 'datasets/website_bestsellers.csv'
df_wb = pd.read_csv(website_bestsellers_csv)
print(df_wb.head())

In [None]:
# Load the TikTok bestseller list and preview its first rows.
tiktok_bestsellers_csv = 'datasets/tiktok_bestsellers.csv'
df_ttb = pd.read_csv(tiktok_bestsellers_csv)
print(df_ttb.head())

# Identifying and Handling Missing Values, Duplicate Data, and Incorrect Data Types

In [None]:
# Report the (rows, columns) shape of every loaded dataset.
for label, frame in (
    ("OrderReport: ", df_or),
    ("ProductList: ", df_pl),
    ("RevenueReport: ", df_rr),
    ("WebsiteBestsellers: ", df_wb),
    ("TiktokBestsellers: ", df_ttb),
):
    print(label, frame.shape)

In [None]:
# Report the per-column count of missing values for every loaded dataset.
for label, frame in (
    ("OrderReport: \n", df_or),
    ("ProductList: \n", df_pl),
    ("RevenueReport: \n", df_rr),
    ("WebsiteBestsellers: \n", df_wb),
    ("TiktokBestsellers: \n", df_ttb),
):
    missing_per_column = frame.isnull().sum()
    print(label, missing_per_column)

In [None]:
# Report the column data types of every loaded dataset.
for label, frame in (
    ("OrderReport: \n", df_or),
    ("ProductList: \n", df_pl),
    ("RevenueReport: \n", df_rr),
    ("WebsiteBestsellers: \n", df_wb),
    ("TiktokBestsellers: \n", df_ttb),
):
    print(label, frame.dtypes)

In [None]:
# Report how many fully duplicated rows each loaded dataset contains.
for label, frame in (
    ("OrderReport: ", df_or),
    ("ProductList: ", df_pl),
    ("RevenueReport: ", df_rr),
    ("WebsiteBestsellers: ", df_wb),
    ("TiktokBestsellers: ", df_ttb),
):
    duplicate_rows = frame.duplicated().sum()
    print(label, duplicate_rows)

In [None]:
# # remove columns with all null values
# df_or = df_or.dropna(axis=1, how='all')
# df_pl = df_pl.dropna(axis=1, how='all')
# df_rr = df_rr.dropna(axis=1, how='all')
# df_wb = df_wb.dropna(axis=1, how='all')
# df_ttb = df_ttb.dropna(axis=1, how='all')
# 
# # save back to csv
# df_or.to_csv('datasets/OrderReports.csv', index=False)
# df_pl.to_csv('datasets/ProductList.csv', index=False)
# df_rr.to_csv('datasets/RevenueReport.csv', index=False)
# df_wb.to_csv('datasets/website_bestsellers.csv', index=False)
# df_ttb.to_csv('datasets/tiktok_bestsellers.csv', index=False)

In [None]:
# # print the head of each dataset
# print("OrderReport: \n", df_or.head())
# print("ProductList: \n", df_pl.head())
# print("RevenueReport: \n", df_rr.head())
# print("WebsiteBestsellers: \n", df_wb.head())
# print("TiktokBestsellers: \n", df_ttb.head())

In [None]:
# check empty sku
# print(df_pl[df_pl['SKU'].isnull()])

# Total number of items sold across all orders (printed for reference only;
# no column is added here).
total = df_or['Items sold'].sum()
print(total)


# Give every product without a SKU its own random 5-digit SKU.
# The collision check must look at the SKU *values*: `value in series` tests
# the index labels of a Series, so it would never see the existing SKUs.
missing_sku = df_pl['SKU'].isnull()
known_skus = df_pl.loc[~missing_sku, 'SKU']
# Track the raw values and their string forms so a clash is detected whether
# the column holds numbers or numeric strings.
taken_skus = set(known_skus) | set(known_skus.astype(str))

new_skus = []
for _ in range(int(missing_sku.sum())):
    # randint's upper bound is exclusive; 100000 keeps 99999 reachable.
    random_sku = int(np.random.randint(10000, 100000))
    while random_sku in taken_skus or str(random_sku) in taken_skus:
        random_sku = int(np.random.randint(10000, 100000))
    # Reserve the value so later rows cannot receive the same SKU.
    taken_skus.add(random_sku)
    new_skus.append(random_sku)

if new_skus:
    df_pl.loc[missing_sku, 'SKU'] = new_skus

# Sanity check: this should now print an empty frame.
print(df_pl[df_pl['SKU'].isnull()])

# save to csv (overwrites the file the product list was loaded from)
df_pl.to_csv('datasets/ProductList.csv', index=False)

# Data Augmentation

## Add a new column to the OrderReport dataset called 'State' and populate it with random states

In [None]:
# Malaysian States
states = ['Johor', 'Kedah', 'Kelantan', 'Labuan', 'Melaka', 'Negeri Sembilan', 'Pahang', 'Penang', 'Perak', 'Perlis', 'Putrajaya', 'Sabah', 'Sarawak', 'Selangor', 'Terengganu', 'WP Kuala Lumpur']

# Relative sampling weight per state. These are weights, not probabilities:
# they do not sum to 1 and random.choices normalises them internally.
states_distribution = {
    'Johor': 0.6,
    'Kedah': 0.5,
    'Kelantan': 0.5,
    'Labuan': 0.01,
    'Melaka': 0.6,
    'Negeri Sembilan': 0.6,
    'Pahang': 0.4,
    'Penang': 0.3,
    'Perak': 0.4,
    'Perlis': 0.5,
    'Putrajaya': 0.06,
    'Sabah': 0.02,
    'Sarawak': 0.02,
    'Selangor': 0.8,
    'Terengganu': 0.4,
    'WP Kuala Lumpur': 0.2,
}

# Initialize random number generator with a fixed seed for consistency
random.seed(42)

# Look each weight up by state name so the weights stay aligned with `states`
# even if the list or the dict is later reordered (a missing state raises
# KeyError instead of silently shifting every weight).
state_weights = [states_distribution[state] for state in states]

# Sample one state per existing order row
existing_rows = len(df_or)
sampled_states = random.choices(states, weights=state_weights, k=existing_rows)

# Assign the sampled states to the existing DataFrame
df_or['State'] = sampled_states

# Save the DataFrame to a CSV file (overwrites the file it was loaded from)
df_or.to_csv('datasets/OrderReports.csv', index=False)

## Add a new column to the OrderReport dataset called 'Age' and populate it with random ages

In [None]:
# Candidate customer ages: every integer from 25 to 45 inclusive.
age_distribution = list(range(25, 46))

# Relative sampling weight for each age above: ages 31-36 are twice as likely
# as the others. These are weights, not probabilities (they sum to 1.35);
# random.choices normalises them internally.
age_distribution_percentage = [0.05] * 6 + [0.10] * 6 + [0.05] * 9

# Fixed seed so the generated ages are reproducible.
random.seed(42)

# Draw one age per existing order row.
existing_rows = len(df_or)
sampled_age = random.choices(
    age_distribution,
    weights=age_distribution_percentage,
    k=existing_rows,
)

# Attach the sampled ages and persist the augmented order report.
df_or['Age'] = sampled_age
df_or.to_csv('datasets/OrderReports.csv', index=False)

# Preview the result and the Age / Gender frequency tables.
print(df_or.head())
for column_name in ('Age', 'Gender'):
    print(df_or[column_name].value_counts())


## Creating a Machine Learning Model using Logistic Regression to predict gender of names

In [None]:
# Train a name -> gender classifier on the labelled name list, then apply it
# to the customer names found in the order report.
df_mn = pd.read_csv('datasets/MalaysianNames_clean.csv')

# Represent each name by counts of its character 2-grams and 3-grams.
name_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))
name_features = name_vectorizer.fit_transform(df_mn['name'])

# Replace the gender labels with integer codes; the encoder is kept so the
# predictions can be mapped back to the original labels below.
label_encoder = LabelEncoder()
df_mn["gender"] = label_encoder.fit_transform(df_mn["gender"])

# Hold out 20% of the names for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(name_features, df_mn["gender"], test_size=0.2, random_state=42)

# NOTE(review): warnings are globally silenced in the import cell, so a
# convergence warning from the default solver settings would not be shown.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy}")

new_names = df_or['Customer'].tolist()
# new_names = ['Lee Mann Heyy']
# Keep the features sparse: the estimator accepts sparse input directly, and
# a dense (orders x n-grams) array can be needlessly large.
new_name_features = name_vectorizer.transform(new_names)

# predict gender of new names and map the codes back to the original labels
new_names_pred = model.predict(new_name_features)
new_names_pred = label_encoder.inverse_transform(new_names_pred)
print(new_names_pred)

# print total number of M and F in the list
# (assumes the labels in the names dataset are 'M' and 'F' - TODO confirm)
predicted_labels = new_names_pred.tolist()
print("Total number of M: ", predicted_labels.count('M'))
print("Total number of F: ", predicted_labels.count('F'))

# Encoding Categorical Data

### For OrderReport dataset

In [None]:
# Binarize the customer-type and gender columns with a single binarizer that
# is re-fitted for each column.
# NOTE(review): presumably both columns hold exactly two categories; verify.
lb = LabelBinarizer()
for source_col, encoded_col in (
    ('Customer type', 'Customer type enc'),
    ('Gender', 'Gender_enc'),
):
    df_or[encoded_col] = lb.fit_transform(df_or[source_col])
print(df_or.head())

# Map each state name to an integer code.
le = LabelEncoder()
state_codes = le.fit_transform(df_or['State'])
df_or['State_enc'] = state_codes
print(df_or.head())

### For ProductList dataset

In [None]:
# Binarize the product status into its own encoded column.
lb = LabelBinarizer()
status_flags = lb.fit_transform(df_pl['Status'])
df_pl['Status enc'] = status_flags
print(df_pl.head())

# Data Visualization

In [None]:
# Bar chart of how many orders fall in each state of the OrderReport dataset.

state_counts = df_or['State'].value_counts()
x = state_counts.index
y = state_counts.values

plt.figure(figsize=(10, 5))
plt.bar(x, y)
# State names are long, so draw the tick labels vertically.
plt.xticks(rotation=90)
plt.xlabel('State')
plt.ylabel('Count')
plt.title('State Distribution')
plt.show()


In [None]:
# Bar chart of how many orders fall on each customer age.
age_counts = df_or['Age'].value_counts()
x = age_counts.index
y = age_counts.values

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x, y)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Pie chart showing each customer type's share of the orders.

customer_type_counts = df_or['Customer type'].value_counts()
x = customer_type_counts.index
y = customer_type_counts.values

fig, ax = plt.subplots(figsize=(10, 5))
# Label every wedge with its percentage, to one decimal place.
ax.pie(y, labels=x, autopct='%1.1f%%')
ax.set_title('Customer Type Distribution')
plt.show()

In [None]:
# Scatter plot of "Items sold" (x-axis) against "N. Revenue" (y-axis).
# Every point is one row of the order report, so the plot shows how revenue
# relates to the number of items sold.

x, y = df_or['Items sold'], df_or['N. Revenue']

plt.figure(figsize=(10, 5))
plt.scatter(x, y)
plt.xlabel('Items Sold')
plt.ylabel('N. Revenue')
plt.title('Items Sold vs. Revenue')
plt.show()
