In [None]:
import pandas as pd

# # Investigating the data

In [None]:
# Read the raw e-commerce export into a DataFrame and preview the first rows.
DATA_CSV = "../input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv"
data = pd.read_csv(DATA_CSV)
data.head()

In [None]:
data.shape

In [None]:
data.tail()

In [None]:
data.columns

# # Preprocessing data

In [None]:
# Drop the unnecessary trailing columns 'Unnamed: 21' .. 'Unnamed: 25'.
unnamed_columns = [f"Unnamed: {number}" for number in range(21, 26)]
data = data.drop(columns=unnamed_columns)

In [None]:
data.shape

In [None]:
# Remove rows in which every column is missing; partially filled rows stay.
data = data.dropna(axis='index', how='all')

In [None]:
data.shape

In [None]:
data.head()

In [None]:
data['status'].unique()

In [None]:
data['status'].value_counts()

In [None]:
data.isnull().sum()

In [None]:
# Show the most frequent value of each column that will be used to fill NaN.
tuple(data[column_name].mode() for column_name in ('category_name_1', 'status'))

In [None]:
# Fill missing values with each column's most frequent value.
# The mode is computed here instead of being typed by hand: the previous
# hard-coded category literal "Mobiles % Tablets" looks like a typo of the
# "&" spelling, and a label that does not already exist in the column would
# silently create a brand-new category instead of filling with the mode.
for column in ('category_name_1', 'status'):
    most_frequent = data[column].mode().iloc[0]
    data[column] = data[column].fillna(most_frequent)

In [None]:
data.isnull().sum()

In [None]:
data['status']

In [None]:
data.dtypes

In [None]:
# Cast identifier columns to strings and count/date-part columns to integers
# in a single pass.
target_dtypes = {
    'Customer ID': str,
    'item_id': str,
    'qty_ordered': int,
    "Month": int,
    "Year": int,
}
data = data.astype(target_dtypes)

In [None]:
data.dtypes

# # Visualizing payment method and order status frequency

In [None]:
# Order frequency: number of orders per status value.
status_counts = data['status'].value_counts()
status_counts.plot.bar(title='Order Frequency Plot', figsize=(12, 8))

In [None]:
# Status distribution restricted to rows whose Year is 2016.
ord_2016 = data.loc[data['Year'].eq(2016)]
ord_2016['status'].value_counts().plot.bar(title='Orders in 2016', figsize=(12, 8))

In [None]:
# Status distribution restricted to rows whose Year is 2017.
ord_2017 = data.loc[data['Year'].eq(2017)]
ord_2017['status'].value_counts().plot.bar(title='Orders in 2017', figsize=(12, 8))

In [None]:
# Status distribution restricted to rows whose Year is 2018.
ord_2018 = data.loc[data['Year'].eq(2018)]
ord_2018['status'].value_counts().plot.bar(title='Orders in 2018', figsize=(12, 8))

In [None]:
# Best-selling category: number of order rows per top-level category.
category_counts = data['category_name_1'].value_counts()
category_counts.plot.bar(title="Plotting the number of orders categories ", figsize=(12, 8))

# # Finding a correlation between payment method and order status

In [None]:
# Most used payment method: number of order rows per payment method.
payment_counts = data['payment_method'].value_counts()
payment_counts.plot.bar(title='Order Frquency plot w.r.t Payment Method', figsize=(12, 8))

In [None]:
# Payment-method distribution among orders whose status is 'complete'.
data_comp = data.loc[data['status'].eq('complete')]
data_comp['payment_method'].value_counts().plot.bar(title='Complete Order', figsize=(12, 8))

In [None]:
# Payment-method distribution among orders whose status is 'received'.
data_rec = data.loc[data['status'].eq('received')]
data_rec['payment_method'].value_counts().plot.bar(title='Recieved Order', figsize=(12, 8))

In [None]:
# Payment-method distribution among orders whose status is 'canceled'.
data_can = data.loc[data['status'].eq('canceled')]
data_can['payment_method'].value_counts().plot.bar(title='Cancelled Order', figsize=(12, 8))

In [None]:
# Payment-method distribution among orders whose status is 'order_refunded'.
data_ref = data.loc[data['status'].eq('order_refunded')]
data_ref['payment_method'].value_counts().plot.bar(title='Refunded Order', figsize=(12, 8))

In [None]:
# Encode the categorical columns payment_method and status as integer
# category codes so a correlation between the two can be computed.
code_columns = (
    ('payment_method', 'payment_encoded'),
    ('status', 'status_encoder'),
)
for source_column, code_column in code_columns:
    data[code_column] = data[source_column].astype("category").cat.codes

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pearson correlation heatmap over the numeric columns (including the integer
# category codes added above). The frame still holds text columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the numeric
# columns are selected explicitly; select_dtypes works on older pandas too.
cor = data.select_dtypes(include='number').corr()
plt.figure(figsize = (12,8))
sns.heatmap(cor,annot=True)

# Finding a correlation between order date and item category

In [None]:
# Encode item category and order date as integer category codes so both can
# take part in the numeric correlation matrix.
# NOTE(review): 'created_at' is coded in the sort order of its raw values; if
# the column holds date strings that order is presumably lexical rather than
# chronological -- confirm before interpreting the coefficient.
data['category_corr'] = data['category_name_1'].astype("category").cat.codes
data['order_date_corr'] = data['created_at'].astype("category").cat.codes

# Restrict to numeric columns: DataFrame.corr() raises on text columns in
# pandas >= 2.0, and select_dtypes also works on older releases.
cor = data.select_dtypes(include='number').corr()
plt.figure(figsize = (12,8))
sns.heatmap(cor,annot=True)

# Can we predict number of orders, or item category or number of customers/amount in advance?

In [None]:
# Installing Pre-requiste libraires to read an Excel file
!pip install xlrd
!pip install openpyxl

In [None]:
# Load the second workbook, which the modelling cells below use as `df`,
# and preview its first rows.
import pandas as pd
df = pd.read_excel("../input/ecommerce-pakistan-for-cl/New_data.xlsx")
df.head()

In [None]:
df['Status'].unique()

In [None]:
df['payment_method'].unique()

In [None]:
df.shape

In [None]:
df[df['Status']=='\\0']

In [None]:
# Drop the disturbing rows by their index labels.
# NOTE(review): presumably these are the rows whose Status is '\\0' shown in
# the previous cell -- confirm the labels still match if the input changes.
disturbing_row_labels = [255520, 255521, 255624, 255625]
df = df.drop(index=disturbing_row_labels)

In [None]:
df['category_name_1'].value_counts()

In [None]:
df

In [None]:
# Initializing the encoder used to transform categorical columns into numbers.
# The same instance is re-fitted for each column in the cells below, so it only
# ever remembers the classes of the column it was fitted on last.
from sklearn.preprocessing import LabelEncoder
encoder= LabelEncoder()

In [None]:
# Learn the distinct category labels, then replace each one by its integer code.
encoder.fit(df['category_name_1'])
df['category_name_1'] = encoder.transform(df['category_name_1'])

In [None]:
# Re-fit the shared encoder on the payment methods and replace each label by
# its integer code (this overwrites the classes learned for any earlier column).
encoder.fit(df['payment_method'])
df['payment_method'] = encoder.transform(df['payment_method'])

In [None]:
df

In [None]:
# Filling NaN status values with the constant 1.
# NOTE(review): presumably 1 is the numeric label of the most common status in
# this workbook -- confirm against the Status values shown above.
df['Status'] = df['Status'].fillna(1)

In [None]:
df['Status'].unique()

In [None]:
df['category_name_1'].unique()

In [None]:
df['payment_method'].unique()

In [None]:
# Separating data and labels: every column except "Status" is a feature,
# "Status" itself is the target.
X = df.drop(columns="Status")
y = df["Status"]

In [None]:
X.shape

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
X_train.shape,X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit three baseline classifiers on the training split and report their
# accuracy on the held-out split.

# The forest is seeded (same seed as the train/test split) so the reported
# accuracy is reproducible from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)
print("Random Forest Acc :" ,rf.score(X_test,y_test))

kn = KNeighborsClassifier()
kn.fit(X_train,y_train)
print("KNN Acc :",kn.score(X_test,y_test))

cl = LogisticRegression()
cl.fit(X_train,y_train)
print('Log Reg Acc :',cl.score(X_test,y_test))

In [None]:
# Compare the true labels of the first ten test rows with each model's output.
sample_features = X_test[0:10]
sample_labels = y_test[0:10]
print("Predictions of Log Reg ", sample_labels, cl.predict(sample_features))
print("Predictions of Ran For ", sample_labels, rf.predict(sample_features))
print("Predictions of KNN ", sample_labels, kn.predict(sample_features))

In [None]:
# Normalizing the price column to the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler

# feature_range is documented as a (min, max) tuple; a list is passed through
# unchecked by older scikit-learn but is not the documented type.
scale = MinMaxScaler(feature_range=(0, 1))

# Fit on the training split only so no information from the test split leaks
# into the scaling parameters.
scale.fit(X_train[['price']])

# assign() builds new frames instead of writing into the frames returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
X_train = X_train.assign(price=scale.transform(X_train[['price']]).ravel())
X_test = X_test.assign(price=scale.transform(X_test[['price']]).ravel())
X_train

In [None]:
# Re-train the same three classifiers on the current (price-normalized)
# training split and report held-out accuracy for comparison.
print("Results after Normalizing the data")

# Seeded so that any accuracy difference versus the earlier run is not just
# random-forest sampling noise.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)
print("Random Forest Acc :" ,rf.score(X_test,y_test))

kn = KNeighborsClassifier()
kn.fit(X_train,y_train)
print("KNN Acc :",kn.score(X_test,y_test))

cl = LogisticRegression()
cl.fit(X_train,y_train)
print('Log Reg Acc :',cl.score(X_test,y_test))

In [None]:
from sklearn.inspection import permutation_importance

# Reuse the logistic regression exactly as it was fitted on the training split.
# Re-fitting it on the full X here would replace the evaluated model with one
# trained on every row (test rows included) and on the un-normalized price.
model = cl

# Score importance on the held-out split, with a fixed seed so the shuffles
# (and therefore the ranking) are reproducible.
results = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)

# Print features from least to most important (X_test has the same columns as X).
for i in results.importances_mean.argsort():
    print(X.columns[i],results.importances_mean[i])

In [None]:
import matplotlib.pyplot as plt

# Column positions ordered from least to most important, with their mean
# permutation-importance scores in the same order.
features = list(results.importances_mean.argsort())
im_mean = [results.importances_mean[position] for position in features]

# One bar per feature, labelled with the corresponding column name.
bar_positions = range(len(im_mean))
plt.bar(bar_positions, im_mean)
plt.xlabel('Features')
plt.ylabel("Imporance mean")
plt.xticks(bar_positions, X.columns[features])