# IMPORTING LIBRARIES

In [None]:
# importing necessary Python libraries
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import os
from PIL import Image
from tqdm import tqdm
from datetime import datetime
import matplotlib.pyplot as plt
from itertools import chain
import plotly.graph_objs as go 
#import plotly.figure_factory as ff

# avoid displaying warnings
import warnings
warnings.filterwarnings("ignore")

#import machine learning related libraries
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
import xgboost as xgb
import time 


# IMPORTING THE DATAFRAMES 

In [None]:

# Load the four competition tables (customers, transactions_train, articles, sample_submission) from
# https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data
df_customers = pd.read_csv(
    filepath_or_buffer='/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv'
)
df_transactions = pd.read_csv(
    filepath_or_buffer='/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'
)
df_articles = pd.read_csv(
    filepath_or_buffer='/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv'
)
df_sample_submission = pd.read_csv(
    filepath_or_buffer='/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'
)

# EDA

## $\color{red}{\text{1. Customers table  Exploration}}$ 

In [None]:

# Column dtypes / non-null counts, followed by a uniqueness check on customer_id
df_customers.info()
separator = '-------------------------------'
print(separator)
print(separator)
# Rows minus distinct customer ids = number of repeated customer_id rows
n_duplicate_ids = len(df_customers) - df_customers['customer_id'].nunique()
print("Duplicate values:", n_duplicate_ids)

#### Customers data description:

customer_id : A unique identifier of every customer<br>
FN : 1 or missed<br>
Active : 1 or missed<br>
club_member_status : Status in club<br>
fashion_news_frequency : How often H&M may send news to customer<br>
age : The current age<br>
postal_code : Postal code of customer<br>

#### We can see there are some null values in customers columns: 'FN','age','Active','club_member_status','fashion_news_frequency'
#### And column 'fashion_news_frequency' has 2 'None' values instead of 'NONE'
#### There are no duplicate values in customers



## Lets Explore some insights about customers 




#### i. Number of Customers per each Age
The most common age is about 21-23



In [None]:
def pie_chart(df, col_values, labels, ax, color, title):
    """Draw an exploded, shadowed pie chart of ``df[col_values]`` on ``ax``.

    Parameters
    ----------
    df : table indexable by column name; one slice is drawn per row.
    col_values : name of the column holding the slice sizes.
    labels : name of the column holding the slice labels.
    ax : axes-like object exposing ``pie`` and ``set_title``.
    color : passed through unchanged as the ``colors`` argument of ``ax.pie``.
    title : title text, set with font size 16.
    """
    # Every slice is pushed out from the centre by the same offset of 0.5
    slice_offsets = tuple(0.5 for _ in range(len(df)))
    ax.pie(df[col_values],
           colors=color,
           explode=slice_offsets,
           labels=df[labels],
           shadow=True)
    ax.set_title(title, fontsize=16)
    
def bar_plot(df, col_x, col_y, ax, color, title):
    """Draw a vertical bar chart of ``df[col_y]`` against ``df[col_x]`` on ``ax``.

    Parameters
    ----------
    df : table indexable by column name.
    col_x : name of the column used for the bar positions.
    col_y : name of the column used for the bar heights.
    ax : axes-like object exposing ``bar``, ``set_title`` and ``tick_params``.
    color : passed through unchanged as the ``color`` argument of ``ax.bar``.
    title : title text, set with font size 16.
    """
    ax.bar(x=df[col_x],
           height=df[col_y],
           color=color)
    ax.set_title(title, fontsize=16)
    # Rotate the x tick labels of the axes we were handed. Going through the
    # pyplot state machine instead would rotate whichever axes happens to be
    # "current", which is the wrong one as soon as several subplots exist.
    ax.tick_params(axis='x', labelrotation=90)
    
# Count customer ids per age value, then reshape into an Age/Customers table
# ordered from the highest age down
temp = df_customers.groupby("age")["customer_id"].count()
df = (
    temp.rename_axis('Age')
        .reset_index(name='Customers')
        .sort_values('Age', ascending=False)
)

# One colour per bar, sampled evenly from the "cool" colormap
fig, axes = plt.subplots(figsize=(10, 6))
color = plt.cm.cool(np.linspace(0, 1, len(df)))

bar_plot(df, 'Age', 'Customers', axes, color, "Number of Customers by age")


#### ii. Customer Status in H&M club. 
Almost every customer has an active club status, some of them begin to activate it (pre-create). A tiny part of customers abandoned the club.

In [None]:
# Histogram of the club_member_status column, drawn on a freshly created figure
sns.set_style("darkgrid")
f, ax = plt.subplots(figsize=(10,5))
ax = sns.histplot(data=df_customers, x='club_member_status', color='blue', ax=ax)
ax.set(xlabel='Distribution of club member status')
plt.show()

## Cleaning the customers table
#### Dropping the customers whose age is missing (NaN)


In [None]:
# Keep only the customers with a known age, then show the NaN count left in each column
df_customers = df_customers[df_customers['age'].notna()]
df_customers.isna().sum()

## Counting purchases per customer to find the frequent buyers (more than 12 purchases)

In [None]:
# Number of transaction rows per customer_id (value_counts orders the result by count, descending)
customers_12 = df_transactions.customer_id.value_counts()

Customers who bought more than 12 items

In [None]:
# Preview: ids of the customers whose transaction count exceeds 12
customers_12[customers_12 > 12].index

In [None]:
# Index of the customer ids with more than 12 purchases
customer_best = customers_12.loc[customers_12.gt(12)].index
customer_best

In [None]:
# Draw a seeded sample of 10000 rows (without replacement) from the customers
# that appear in customer_best
is_frequent_buyer = df_customers['customer_id'].isin(customer_best)
customer_sample = df_customers[is_frequent_buyer].sample(n=10000, random_state=1)


## $\color{red}{\text{2. Articles table  Exploration }}$ 

In [None]:
# Column names, dtypes and non-null counts of the articles table
df_articles.info()

#### This table contains all h&m articles with details such as a type of product, a color, a product group and other features.
Article data description:

article_id : A unique identifier of every article.</br>
product_code, prod_name : A unique identifier of every product and its name (not the same).</br>
product_type, product_type_name : The group of product_code and its name</br>
graphical_appearance_no, graphical_appearance_name : The group of graphics and its name</br>
colour_group_code, colour_group_name : The group of color and its name</br>
perceived_colour_value_id, perceived_colour_value_name, perceived_colour_master_id, perceived_colour_master_name : The added color info</br>
department_no, department_name: : A unique identifier of every dep and its name</br>
index_code, index_name: : A unique identifier of every index and its name</br>
index_group_no, index_group_name: : A group of indeces and its name</br>
section_no, section_name: : A unique identifier of every section and its name</br>
garment_group_no, garment_group_name: : A unique identifier of every garment and its name</br>
detail_desc: : Details</br>

#### Some of the columns have -1 values probably referring to missing data

In [None]:
# List every articles column in which the value -1 occurs at least once
print("columns having -1 values: \n")
cols_missing_value = [
    column for column in df_articles.columns
    if df_articles[column].eq(-1).any()
]
print(cols_missing_value)

In [None]:
def missing_data(data):
    """Summarise the null values of every column of ``data``.

    Returns a DataFrame indexed by column name with two columns:
    ``Total`` (number of nulls) and ``Percent`` (nulls as a percentage of
    the row count). Both are sorted in descending order before being
    joined side by side.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    # count() on the boolean mask gives the number of rows for each column
    percent = (null_counts / null_mask.count() * 100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])



In [None]:
# Show the 7 articles columns with the most nulls, rendered with a tinted background
missing_data(df_articles).head(7).style.set_properties(**{'background-color': 'rgba(245, 181, 152,.5)'})

### -1 value in all code columns refer to the 'Unknown' category. Therefore keeping -1 values as it is

## Lets Explore some insights about Articles table 


#### i. Ladieswear accounts for a significant part of all dresses. Sportswear has the least portion.



In [None]:
# Horizontal histogram: number of articles per index_name
f, ax = plt.subplots(figsize=(15, 7))
ax = sns.histplot(data=df_articles, y='index_name', color='blue', ax=ax)
ax.set(xlabel='count by index name', ylabel='index name')
plt.show()

#### ii. Lets see what ladies buys the most 

The garments grouped by index: As we can see Jersey fancy is the most frequent garment, especially for women and children. The next by number is accessories, many various accessories with low price.



In [None]:
# Horizontal histogram: number of articles per garment group, coloured by index group
f, ax = plt.subplots(figsize=(15, 7))
ax = sns.histplot(data=df_articles, y='garment_group_name', color='blue',
                  hue='index_group_name', ax=ax)
ax.set(xlabel='count by garment group', ylabel='garment group')
plt.show()

# FEATURE SELECTION & FEATURE ENGINEERING

#### Data splitting is among the most critical steps before preprocessing. If the preprocessing is fitted on the full dataset before it is divided, information from the test set can leak into training and the model evaluation will be overestimated. Splitting the data prior to processing avoids this: the test data stay unseen and are then processed in exactly the same manner as the training data, which keeps the measured model performance consistent.

In [None]:
# Split the customer sample (10000 rows) into train (70%) and test (30%) customers.
# The seed is pinned, matching the seeded sampling step that built customer_sample,
# so that re-running the notebook yields the same split and comparable metrics.
customer_train, customer_test = train_test_split(customer_sample, test_size=0.3, random_state=1)

In [None]:
# Quick look at the split; none of these results is assigned
# (a notebook cell renders only its last expression, i.e. the train shape)
customer_train.head()
customer_test.shape 
customer_train.shape 

### Data frame df_articles has been cleaned up by removing extra columns. We need to drop a couple of columns from the df_articles data frame since it is extremely large, and we won't be evaluating the machine learning model with those columns.

In [None]:
# Build a slimmer copy of the articles table without the descriptive name
# columns and the perceived-colour id columns
unwanted_article_columns = [
    'prod_name',
    'product_type_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]
clean_df_articles = df_articles.drop(columns=unwanted_article_columns)

In [None]:
# Confirm which columns remain after the drop
clean_df_articles.info()

##### From the above cell, we can see that the unwanted columns have been removed. Additionally, we convert the data types of the article columns to categorical to reduce processing time.

In [None]:
# Cast the code-like / low-cardinality article columns to the pandas 'category' dtype.
# Each column is converted from ITSELF: the previous version filled 'index_code' from
# 'product_type_no', silently replacing the index codes with product type numbers.
categorical_article_columns = [
    'product_code',
    'product_type_no',
    'product_group_name',
    'graphical_appearance_no',
    'colour_group_code',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    'detail_desc',
]
for column in categorical_article_columns:
    clean_df_articles[column] = clean_df_articles[column].astype('category')

In [None]:
# Re-check the dtypes after the category conversion
clean_df_articles.info()

## Merge the df_transactions table and the cleaned articles table into a new df called article_transactions

In [None]:
# Left join: keep every transaction row and attach the article attributes by article_id
article_transactions = pd.merge(
    df_transactions,
    clean_df_articles,
    how='left',
    on='article_id',
)

In [None]:
# Schema and non-null counts of the merged transactions/articles table
article_transactions.info()

In [None]:
# First two merged rows
article_transactions.head(2)

## Merge customer training and customer test dataframe which we created earlier with article_transactions df to create a proper Train & Test Df which we will use for our testing and prediction.

In [None]:
# Inner-join each customer split with article_transactions on customer_id,
# giving one row per (customer, transaction) in the train and test frames
train = pd.merge(customer_train, article_transactions, how='inner', on='customer_id')
test = pd.merge(customer_test, article_transactions, how='inner', on='customer_id')

In [None]:
# Schema and non-null counts of the merged train and test frames
train.info()
test.info()

In [None]:
# NOTE(review): head(-2) returns every row except the last two; if a short
# two-row preview was intended this should be head(2) — confirm
train.head(-2)

## Convert Column t_dat in both train & test df to proper datetime format

In [None]:
# Parse the t_dat column of both frames into pandas datetimes
train['t_dat'] = pd.to_datetime(train['t_dat'])
test['t_dat'] = pd.to_datetime(test['t_dat'])

In [None]:

# Order both frames by customer and, within each customer, by transaction date
train = train.sort_values(by=['customer_id', 't_dat'])
test = test.sort_values(by=['customer_id', 't_dat'])

## Get last 12 item buys of all the customers

In [None]:
# Row labels of the last (up to) 12 rows of every customer, taken in the frame's
# current row order (the preceding cell sorts by customer_id and t_dat)
train_group = (
    train
    .groupby('customer_id', observed=True)
    .tail(12)
    .index
)
test_group = (
    test
    .groupby('customer_id', observed=True)
    .tail(12)
    .index
)

In [None]:
# Preview of the rows selected by the index labels computed above. The same
# .loc selection is repeated in front of the groupby in the cell that builds
# y_train / y_test; nothing is assigned here.
train.loc[train_group,:] 
test.loc[test_group,:]

## We drop unwanted columns from train and test to get dataframes with customers and their last 12 purchases only.
This will give us a y_truth value which states real purchases by customers.

In [None]:
#train.drop(index=train_group, inplace=True)
#test.drop(index=test_group, inplace=True)

In [None]:
# For each customer, collect the article ids of the selected rows into a plain list
y_train = (
    train.loc[train_group, :]
    .groupby('customer_id', observed=True)['article_id']
    .apply(pd.Series.tolist)
)
y_test = (
    test.loc[test_group, :]
    .groupby('customer_id', observed=True)['article_id']
    .apply(pd.Series.tolist)
)

In [None]:
# Peek at the first 100 per-customer purchase lists (only the last expression is rendered)
y_train.head(100)
y_test.head(100)

In [None]:
# Reduce every customer's purchase list to a single target label:
# the element at position 1 (i.e. the second entry) of the list
uno_y_train = y_train.map(lambda purchases: purchases[1])
uno_y_test = y_test.map(lambda purchases: purchases[1])

In [None]:
# First few single-article training targets
uno_y_train.head()

In [None]:
# First few single-article test targets
uno_y_test.head()

# Function to get the x_train & x_test data frames.
This function is helpful for building the feature values that will be used to compute accuracy.


In [None]:
def function_features(customers):
    """Collapse one customer's rows into a single feature row.

    ``customers`` is expected to hold all rows of one customer (it is applied
    per ``customer_id`` group). The profile columns are read from the first
    row, and ``bought_items`` is the number of rows in the group.

    Returns a ``pd.Series`` indexed by feature name.
    """
    profile_columns = (
        'FN',
        'Active',
        'club_member_status',
        'fashion_news_frequency',
        'age',
        'postal_code',
    )
    # Profile values are taken from the first row of the group
    features = {name: customers[name].iloc[0] for name in profile_columns}
    features['bought_items'] = len(customers)
    return pd.Series(features)
        


Getting x_train & x_test

In [None]:
# Register the progress_apply method on pandas objects (notebook flavour of tqdm)
from tqdm.notebook import tqdm
tqdm.pandas()

x_train = train.groupby('customer_id', observed=True).progress_apply(function_features) # one feature row per training customer, built by function_features

x_test  = test.groupby('customer_id', observed=True).progress_apply(function_features) # same for the test customers

In [None]:
# First few training feature rows
x_train.head()

In [None]:
# First few test feature rows
x_test.head()

## Now we get the prediction column (y_prediction) for training data


In [None]:
# Per customer: the first 12 article ids (in row order) rendered as the text of a
# numpy array with the surrounding brackets stripped
train_prediction = (
    train.groupby(["customer_id"])["article_id"]
    .agg(lambda ids: str(ids.values[:12])[1:-1])
    .reset_index()
)

Lets define a function to get prediction values by splitting all items values.

In [None]:
def articles_padding(x):
    """Turn a whitespace-separated string of article ids into a padded prediction string.

    Each id gets a "0" prepended. When fewer than 12 ids are present, the
    list is topped up to 12 with the leading entries of the notebook-level
    ``art_list``. Returns the ids joined by single spaces, or ``None`` when
    ``x`` is empty/falsy.
    """
    if not x:
        return None
    padded_ids = ["0" + token for token in x.split()]
    missing = 12 - len(padded_ids)
    if missing > 0:
        # art_list is only consulted when padding is actually required
        padded_ids.extend(art_list[:missing])
    return " ".join(padded_ids)

In [None]:
# Zero-prefix (and, if needed, pad) every customer's id string
train_prediction["article_id"] = train_prediction["article_id"].apply(articles_padding)

Temporary predicted articles (we will get proper predictions later).

In [None]:
# First few padded prediction strings for the training customers
train_prediction.head()

# Now we get the prediction column (y_prediction) for test data


In [None]:
# Per customer: the first 12 article ids (in row order) rendered as the text of a
# numpy array with the surrounding brackets stripped
test_prediction = (
    test.groupby(["customer_id"])["article_id"]
    .agg(lambda ids: str(ids.values[:12])[1:-1])
    .reset_index()
)

In [None]:
def articles_padding_test(x):
    """Zero-prefix the article ids in ``x`` and pad the result to 12 entries.

    ``x`` is a whitespace-separated string of ids. Every id is prefixed with
    "0"; if fewer than 12 ids are found, entries from the start of the
    notebook-level ``art_list`` fill the gap. The ids are returned joined by
    single spaces. An empty/falsy ``x`` yields ``None``.
    """
    if x:
        prefixed = [f"0{raw_id}" for raw_id in x.split()]
        if len(prefixed) < 12:
            # top up with as many fallback articles as are missing
            prefixed += art_list[:12 - len(prefixed)]
        return " ".join(prefixed)
    return None

In [None]:
# Zero-prefix (and, if needed, pad) every customer's id string
test_prediction["article_id"] = test_prediction["article_id"].apply(articles_padding_test)

In [None]:
# First few padded prediction strings for the test customers
test_prediction.head()

# Machine Learning Algorithms to Find the Accuracy and Predictions of the Training and Test Models.

Before we proceed with the ML part, we need to one-hot encode the categorical columns in x_train & x_test, because the classifiers need numeric (int or float) inputs before we can fit them and compute the prediction accuracy.

In [None]:
# Replace the NaNs in x_train with 0. fillna returns a new frame, so the result
# has to be assigned back; previously it was only displayed and then discarded.
x_train = x_train.fillna(0)
x_train

In [None]:
# Replace the NaNs in x_test with 0. fillna returns a new frame, so the result
# has to be assigned back; previously it was only displayed and then discarded.
x_test = x_test.fillna(0)
x_test

## LETS CONVERT 'FN', 'Active', 'club_member_status', 'fashion_news_frequency' columns (these are categorical columns) to binary using one hot encoding in x_train and x_test

In [None]:
# One-hot encode the categorical columns of the train and test feature frames
one_hot_columns = ['FN', 'Active', 'club_member_status', 'fashion_news_frequency']
x_train_encoded = pd.get_dummies(x_train, columns=one_hot_columns)
x_test_encoded = pd.get_dummies(x_test, columns=one_hot_columns)

# The two frames are encoded independently, so a category present in only one of
# them would yield different columns. Align test to the training layout: columns
# missing in test are added as all-zero, columns unseen in training are dropped.
x_test_encoded = x_test_encoded.reindex(columns=x_train_encoded.columns, fill_value=0)

In [None]:
# postal_code is not used as a model feature, so remove it from both encoded frames
x_train_encoded = x_train_encoded.drop(columns=['postal_code'])
x_test_encoded = x_test_encoded.drop(columns=['postal_code'])

In [None]:

# First few encoded training rows
x_train_encoded.head()


In [None]:
# First few encoded test rows
x_test_encoded.head()

We can see that all categorical columns are now converted to binary 1's & 0's to make our job easier to get accuracy

## Now lets find accuracy using customers data which we did to get a value

# i. DECISION TREE

In [None]:

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [None]:
# Build a default decision tree classifier and fit it on the encoded training
# features against the single-article targets
clf = DecisionTreeClassifier().fit(x_train_encoded, uno_y_train)

# NOTE: the predictions below are made on the SAME training features the tree was
# fitted on, so any score computed from y_pred is a training-set score, not a
# test-set score
y_pred = clf.predict(x_train_encoded)

In [None]:
# y_train holds the per-customer purchase lists; the tree above was fitted on uno_y_train
print(y_train)

In [None]:
# Predicted article id for every training customer
print(y_pred)

# **WE GET AN ACCURACY OF 60 % USING THE DECISION TREE. NOTE THAT THIS IS MEASURED ON THE TRAINING DATA THE MODEL WAS FITTED ON, NOT ON THE HELD-OUT TEST SET**

---



In [None]:
# Compares the training targets with y_pred (the predictions stored by the decision-tree cell)
print("Accuracy:",metrics.accuracy_score(uno_y_train, y_pred))

## ii. KNN Algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

X = x_train_encoded
y = uno_y_train

# 5-nearest-neighbours classifier fitted on the encoded training features
Knnc = KNeighborsClassifier(n_neighbors=5)
Knnc.fit(X, y)

# Accuracy on the data the model was fitted on (the only figure reported before);
# on its own this says nothing about how well the model generalises
y_pred = Knnc.predict(X)
print("Train accuracy:", accuracy_score(y, y_pred))

# Accuracy on the held-out customers. The test frame is first aligned to the
# training column layout because the two frames were one-hot encoded separately.
X_test = x_test_encoded.reindex(columns=X.columns, fill_value=0)
y_pred_test = Knnc.predict(X_test)
print("Test accuracy:", accuracy_score(uno_y_test, y_pred_test))


# Lets Get single predictions  for all customer using following function

In [None]:
#allx = pd.concat([x_train_encoded, x_test_encoded]) 
#average_customer = allx.mean(axis=0).to_frame().T
#missing_new = df_customers['customer_id'][~df_customers['customer_id'].isin(allx.index)]
#submissions = clf.predict(allx)
#customer_ppred = clf.predict(average_customer)

In [None]:
#submissions = pd.DataFrame({'customer_id': allx.index, 'prediction' : submissions})
#submissions = pd.concat([submissions, pd.DataFrame({'customer_id': missing_new, 'prediction': np.repeat(customer_ppred, len(missing_new))})])

In [None]:
#submissions.sort_index(inplace=True)

# Lets Get 12 prediction for all customers using following function

In [None]:
# Per customer: the first 12 article ids (in row order) of the full transaction
# table, rendered as the text of a numpy array with the brackets stripped
pred_df = (
    df_transactions.groupby(["customer_id"])["article_id"]
    .agg(lambda ids: str(ids.values[:12])[1:-1])
    .reset_index()
)

In [None]:
# Transactions recorded on the latest t_dat value in the data
last_date = df_transactions.t_dat.max()
last_day_transactions = df_transactions.loc[df_transactions.t_dat == last_date]
print(last_day_transactions.shape)

# The 12 most frequently bought articles of that day, each id prefixed with "0"
most_frequent_articles = list(last_day_transactions.article_id.value_counts().index[:12])
art_list = ["0" + str(art) for art in most_frequent_articles]
art_str = " ".join(art_list)
print("Frequent articles bought recently:", art_str, end="\n")

In [None]:
def padding_articles_prediction(x, fallback=None, n_items=12):
    """Build a submission prediction string from a customer's raw article ids.

    Parameters
    ----------
    x : str
        Whitespace-separated article ids (presumably the bracket-stripped text
        of a numpy integer array, as produced when ``pred_df`` is built).
    fallback : list of str, optional
        Already-formatted article ids used to top the prediction up to
        ``n_items`` entries. Defaults to the notebook-level ``art_list``,
        which is looked up only when padding is actually needed.
    n_items : int, optional
        Minimum number of ids in the result (default 12). Inputs that already
        contain more ids are returned in full, not truncated.

    Returns
    -------
    str
        The ids, each prefixed with "0", joined by single spaces. An empty
        ``x`` now yields the first ``n_items`` fallback ids instead of
        ``None``, so that no customer ends up without a prediction.
    """
    ids = ["0" + token for token in x.split()] if x else []
    shortfall = n_items - len(ids)
    if shortfall > 0:
        pool = art_list if fallback is None else fallback
        ids.extend(pool[:shortfall])
    return " ".join(ids)

In [None]:
# Zero-prefix (and, if needed, pad) every customer's id string
pred_df["article_id"] = pred_df["article_id"].apply(padding_articles_prediction)


In [None]:
# Attach our predictions to every customer_id listed in the sample submission
df_submission = pred_df.merge(df_sample_submission[["customer_id"]], how="right")
df_submission.columns = ["customer_id", "prediction"]
# The right merge keeps customers that have no rows in pred_df and leaves their
# prediction as NaN; give them the recently popular articles instead so that
# the submission has no empty predictions
df_submission["prediction"] = df_submission["prediction"].fillna(art_str)
df_submission.head().style.set_properties(**{'background-color': 'rgba(184,230,194,.5)'})

In [None]:
# Write the submission to submission.csv in the working directory, without the index column
df_submission.to_csv('submission.csv',index=False)