In [None]:
import cudf
import cupy as cp
import cuml

# **Loading dataset**

In [None]:
# Read the transaction log into a cuDF DataFrame and preview its first rows.
trans_train = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'
)
trans_train.head()

In [None]:
# Read the customer table into a cuDF DataFrame and preview its first rows.
cus_train = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/customers.csv'
)
cus_train.head()

In [None]:
# Read the article table into a cuDF DataFrame and preview its first rows.
arti_train = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv'
)
arti_train.head()

# **DATA PREPROCESSING**  
Unnecessary columns are removed from each dataset. To reduce memory consumption, article ids are stored as int32 and customer ids are stored as int64 (parsed from the last 16 hexadecimal characters of the original string) instead of as strings.

In [None]:
# Compress the customer id: keep its last 16 hex characters and parse them as int64.
trans_train['customer_id'] = (
    trans_train['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('int64')
)
# Narrow the article id to int32 and parse the transaction date as a datetime.
trans_train['article_id'] = trans_train['article_id'].astype('int32')
trans_train['t_dat'] = cudf.to_datetime(trans_train['t_dat'])
# Keep only the three columns used by the rest of the notebook.
trans_train = trans_train[['t_dat', 'customer_id', 'article_id']]
# Second name bound to the same DataFrame object (no copy is made here).
trans_train_original = trans_train
print(trans_train.shape)
trans_train.head()

In [None]:
# Only the customer id and age are needed from the customer table.
cus_train = cus_train[['customer_id', 'age']]
# Same id compression as for the transactions so the two tables can be joined.
cus_train['customer_id'] = (
    cus_train['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('int64')
)
cus_train.head()

In [None]:
# Keep the article id plus the three numeric attributes used later as features.
arti_train = arti_train[[
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
]]
# Match the int32 article id dtype used in the transaction table.
arti_train['article_id'] = arti_train['article_id'].astype('int32')
arti_train.head()

# **Items purchased  within 2 weeks**

In [None]:
# Latest transaction date per customer, exposed as the column 'max_dat'.
pucr = (
    trans_train
    .groupby('customer_id')['t_dat']
    .max()
    .reset_index()
    .rename(columns={'t_dat': 'max_dat'})
)


In [None]:
# Display the per-customer latest purchase dates (columns: customer_id, max_dat).
pucr

In [None]:
# Attach each customer's latest purchase date to every one of their transactions.
trans_train = trans_train.merge(pucr, how='left', on='customer_id')
# Whole days between a transaction and that customer's latest transaction.
trans_train['diff_dat'] = (trans_train['max_dat'] - trans_train['t_dat']).dt.days
# Keep only purchases made within 14 days of the customer's latest purchase.
trans_train = trans_train[trans_train['diff_dat'] <= 14]

In [None]:
# Sanity check: list the distinct day gaps that survived the 14-day filter.
trans_train.diff_dat.unique()

# **Sorting by purchase count, then by most recent date**

In [None]:
# Number of transactions per (customer, article) pair, exposed as the column 'count'.
pucr = (
    trans_train
    .groupby(['customer_id', 'article_id'])['t_dat']
    .count()
    .reset_index()
    .rename(columns={'t_dat': 'count'})
)


In [None]:
# Display the per-(customer, article) purchase counts.
pucr

In [None]:
# Attach the purchase count to every transaction, then order rows so that the
# most frequently bought (and, within ties, most recently bought) come first.
trans_train = (
    trans_train
    .merge(pucr, how='left', on=['customer_id', 'article_id'])
    .sort_values(by=['count', 't_dat'], ascending=False)
)


In [None]:
# Display the transactions after merging in the counts and sorting.
trans_train

In [None]:
# Collapse repeated (customer, article) pairs to one row, restore the
# count/date ordering, and renumber the rows from zero.
trans_train = (
    trans_train
    .drop_duplicates(subset=['customer_id', 'article_id'])
    .sort_values(by=['count', 't_dat'], ascending=False)
    .reset_index(drop=True)
)


In [None]:
# Display the de-duplicated, re-sorted transactions.
trans_train

In [None]:
# Move the current row index into a regular column named 'index'
# (reset_index keeps the old index as a column by default).
trans_train = trans_train.reset_index()
trans_train

# **Recommendation according to age of customer**

In [None]:
# Join each retained transaction with its customer's age (default inner join).
cust_age = trans_train.merge(cus_train, on='customer_id')
cust_age.head()

In [None]:
# Keep the columns needed for modelling and replace missing ages with 18.
cust_age = (
    cust_age[['index', 'customer_id', 'age', 'article_id']]
    .fillna({'age': 18})
)


In [None]:
# Display the customer/age/article table after filling missing ages.
cust_age

In [None]:
# Join the article attributes onto each (customer, age, article) row (default inner join).
art_sel = cust_age.merge(arti_train, on='article_id')

In [None]:
# Display the joined table that the features and labels are taken from.
art_sel

In [None]:
# Feature matrix: customer age plus three numeric article attributes.
output1 = art_sel[[
    'age',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
]]
output1.head()

In [None]:
# Label frame: the purchased article id, kept as a one-column DataFrame.
output2 = art_sel[['article_id']]
output2.head()

# KNN 
Taking k value equal to 13 and using euclidean distance

In [None]:
from cuml.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier: 13 neighbours, Euclidean distance.
knn = KNeighborsClassifier(
    n_neighbors=13,
    metric='euclidean',
)

In [None]:
# Fit on the first 75,000 rows only (features -> article id).
knn.fit(output1.iloc[:75000], output2.iloc[:75000])

In [None]:
ans=knn.predict(output1[:50000])

In [None]:
y_actual = output2[:50000]

In [None]:
import numpy as np

# Calculating R2 score

In [None]:
# Inspect the container type of the labels and the type of a single label value.
print(type(y_actual))
type(y_actual.article_id[0])

In [None]:
# Pull the label column out of the DataFrame as a bare array.
y1 = y_actual.article_id.values

In [None]:
# Cast the labels to 64-bit floats (the dtype Python's float maps to).
y1 = y1.astype(cp.float64)

In [None]:
# Cast the predictions to 64-bit floats as well.
ans = ans.astype('float64')

In [None]:
# Inspect the container type of the label array.
type(y1)

In [None]:
# Display a flattened view of the labels; the result is not assigned, so y1 is unchanged.
y1.flatten()

In [None]:
# Cast the labels back to the platform's default integer dtype.
y1=y1.astype(cp.int_)

In [None]:
# Inspect the shape of the label array.
y1.shape

In [None]:
# Display the labels as a single column; the result is not assigned, so y1 keeps its shape.
y1.reshape(len(y1),1)

In [None]:
# Display the (float-typed) predictions.
ans

In [None]:
# Integer-typed version of the predictions under a separate name; ans itself stays float.
ans1=ans.astype(cp.int_)

In [None]:
# data=np.concatenate((ans.reshape(len(ans), 1), output2[]))
from sklearn.metrics import confusion_matrix

In [None]:
# Inspect the container type of the predictions.
type(ans)

In [None]:
# Inspect the shape of the predictions.
ans.shape

In [None]:
# Inspect the container type of the labels once more before scoring.
type(y1)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# scikit-learn metrics work on host (NumPy) data, while y1 and ans live on the
# GPU; device arrays refuse implicit conversion to NumPy, so copy both to the
# host explicitly. Arguments follow scikit-learn's (y_true, y_pred) order.
score=accuracy_score(cp.asnumpy(y1), cp.asnumpy(ans))

In [None]:
# cuML's signature is r2_score(y, y_hat): ground truth first, predictions second.
# R^2 is not symmetric in its arguments, so the order matters. Both inputs are
# cast to float64 so they share one floating-point dtype.
# NOTE(review): R^2 is a regression metric; its value on categorical article ids
# has no clear interpretation - confirm this is the intended measure.
final=cuml.metrics.r2_score(
    cp.asarray(y1, dtype=cp.float64),
    cp.asarray(ans, dtype=cp.float64),
)

In [None]:
# Display the R^2 value computed in the previous cell.
final

# Submission File

In [None]:
# Turn each article id into text prefixed with a space and a '0', so that
# concatenating a customer's ids later yields a space-separated list.
trans_train['article_id'] = ' 0' + trans_train['article_id'].astype('str')
trans_train

In [None]:
# Copy the two columns needed for the submission into a pandas DataFrame.
p_trans_train = (
    trans_train[['customer_id', 'article_id']]
    .to_pandas()
)
p_trans_train

In [None]:
# Summing the string column concatenates each customer's article ids into one string.
purc = p_trans_train.groupby('customer_id').sum().reset_index()
# The concatenated ids become the 'prediction' column.
purc = purc.rename(columns={'article_id': 'prediction'})
# Move the result back into a cuDF DataFrame, reusing the name trans_train.
trans_train = cudf.DataFrame(purc)

In [None]:
# Display the per-customer prediction strings (trans_train now holds customer_id, prediction).
trans_train

In [None]:
# Rename the integer id column so it lines up with the join key built for the submission.
trans_train = trans_train.rename(columns={'customer_id': 'customer_id_edited'})
trans_train

In [None]:
# Start from the sample submission, keeping only the original string customer ids.
submission = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'
)[['customer_id']]
# Build the same int64 key used for the transactions (last 16 hex characters).
submission['customer_id_edited'] = (
    submission['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('int64')
)
# Left-join the predictions, blank out customers without any, and drop the helper key.
submission = (
    submission
    .merge(trans_train, how='left', on='customer_id_edited')
    .fillna('')
    .drop(columns=['customer_id_edited'])
)
submission


In [None]:
# Remove surrounding whitespace, then cap each prediction string at 131 characters.
# NOTE(review): 131 presumably corresponds to 12 ten-character ids plus 11
# separating spaces - confirm against the competition's submission format.
submission['prediction'] = submission['prediction'].str.strip().str[:131]
submission.to_csv('submission.csv', index=False)
submission.head()