In [3]:
import pandas as pd 
import vaex
import seaborn as sns
import pyodbc

# Data Import

In [4]:
# Open the three raw CSV exports as vaex DataFrames, in the order
# transactions -> customers -> articles.
transactions_df, customers_df, articles_df = (
    vaex.open(csv_path)
    for csv_path in ('transactions_train.csv', 'customers.csv', 'articles.csv')
)

In [5]:
# Preview the first five transaction rows.
transactions_df.head(n=5)

#,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,'000058a12d5b43e67d225668fa1f8d618c13dc232df0cad...,663713001,0.0508305,2
1,2018-09-20,'000058a12d5b43e67d225668fa1f8d618c13dc232df0cad...,541518023,0.0304915,2
2,2018-09-20,'00007d2de826758b65a93dd24ce629ed66842531df66993...,505221004,0.0152373,2
3,2018-09-20,'00007d2de826758b65a93dd24ce629ed66842531df66993...,685687003,0.0169322,2
4,2018-09-20,'00007d2de826758b65a93dd24ce629ed66842531df66993...,685687004,0.0169322,2


In [7]:
# Preview the first five customer rows.
customers_df.head(n=5)

#,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,'00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d...,,,ACTIVE,NONE,49,'52043ee2162cf5aa7ee79974281641c6f11a68d276429a9...
1,'0000423b00ade91418cceaf3b26c6af3dd342b51fd051ee...,,,ACTIVE,NONE,25,'2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93f...
2,'000058a12d5b43e67d225668fa1f8d618c13dc232df0cad...,,,ACTIVE,NONE,24,'64f17e6a330a85798e4998f62d0930d14db8db1c054af6c...
3,'00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c...,,,ACTIVE,NONE,54,'5d36574f52495e81f019b680c843c443bd343d5ca5b1c22...
4,'00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe...,1.0,1.0,ACTIVE,Regularly,52,'25fa5ddee9aac01b35208d01736e57942317d756b32ddd4...


In [8]:
# Connection string for the local SQL Server Express instance, using Windows
# authentication (Trusted_Connection). The server segment is a raw string:
# in a normal literal "\s" is an invalid escape sequence, which Python flags
# with a warning. The resulting value is unchanged (`.\sqlexpress`).
cnxn_str = ("Driver={SQL Server Native Client 11.0};"
            r"Server=.\sqlexpress;"
            "Database=h_m;"
            "Trusted_Connection=yes;")

# NOTE(review): the connection and cursor stay open and in scope in case later
# cells reuse them; `cursor` itself is not used in this cell.
cnxn = pyodbc.connect(cnxn_str)
cursor = cnxn.cursor()


# Build a (product_code, prod_name) lookup with one row per product_code:
#   p - distinct (product_code, prod_name) pairs from the articles table
#   d - product codes that appear with more than one name
#   s - product codes that appear with exactly one name
# For codes in `d` only the row with rn = 1 is kept. The window is ordered by
# prod_name so the surviving name is deterministic; ordering by the partition
# key itself makes every row tie, leaving the choice up to the server.
query = pd.read_sql(
"""

;with p as (
SELECT distinct product_code,prod_name
 FROM [h_m].[dbo].[articles]),

d as (
select p.product_code,count(*) cnt
from p 
group by p.product_code
having count(*) >1),

s as (
select product_code
from p
group by product_code
having count(*) = 1
)

select x.product_code,x.prod_name
from(
select p.*, ROW_NUMBER() over (partition by p.product_code order by p.prod_name) rn
from d
join p
on d.product_code = p.product_code
)x
where x.rn = 1
union 
select p.*
from s
join p
on s.product_code = p.product_code""", cnxn)

# Convert the pandas result into a vaex DataFrame for the joins further down.
product = vaex.from_pandas(query)

In [9]:
# Preview the first three raw article rows.
articles_df.head(n=3)

#,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,1,Dusty Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


# Data Cleaning

In [None]:
# Rename the transaction date column from `t_dat` to `date`; the later
# datetime conversion reads `df['date']`, so this cell must have run first.
# The rename happens in place on `transactions_df`, so on a second run
# `t_dat` is no longer present. The guard makes the cell safe to re-run.
if 't_dat' in transactions_df.get_column_names():
    transactions_df.rename('t_dat', 'date')

In [10]:
# Drop the numeric code/id columns and `prod_name` from the articles table.
articles_df = articles_df.drop(
    columns=[
        'product_type_no',
        'graphical_appearance_no',
        'colour_group_code',
        'perceived_colour_value_id',
        'perceived_colour_master_id',
        'department_no',
        'section_no',
        'garment_group_no',
        'index_group_no',
        'index_code',
        'prod_name',
    ]
)

In [11]:
# Rename the article-side key to `prod_id` so it differs from the
# `product_code` column of the `product` lookup it is joined with below.
# The rename is applied in place; the cell displays the new column name.
articles_df.rename(name='product_code', new_name='prod_id')

'prod_id'

In [12]:
# Cast the lookup's product code to int64 before it is used as a join key.
product['product_code'] = product['product_code'].astype('int64')

In [13]:
# Attach the product name from the lookup to every article.
# `product` is the left side, so each product row is allowed to repeat once
# per matching article (allow_duplication=True).
articles_df = product.join(
    articles_df,
    how='inner',
    left_on='product_code',
    right_on='prod_id',
    allow_duplication=True,
)

In [14]:
# The two join-key columns are no longer needed after the join.
articles_df = articles_df.drop(
    columns=[
        'prod_id',
        'product_code',
    ]
)

In [15]:
# Preview the first three rows of the cleaned articles table.
articles_df.head(n=3)

#,prod_name,article_id,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,detail_desc
0,Strap top,108775015,Vest top,Garment Upper body,Solid,Black,Dark,Black,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
1,3P TANKTOP BODY,146706001,Bodysuit,Garment Upper body,Solid,White,Light,White,Baby basics,Baby Sizes 50-98,Baby/Children,Baby Essentials & Complements,Jersey Basic,'Sleeveless bodysuits in soft organic cotton jer...
2,Anton sport pant,156610001,Trousers,Garment Lower body,Solid,Black,Dark,Black,Men Sport Woven,Sport,Sport,Men H&M Sport,Jersey Fancy,'Sports trousers in fast-drying fabric made from...


In [17]:
# Enrich each transaction with the attributes of the purchased article.
# Both sides use the same key name, so a single `on` replaces the
# identical left_on/right_on pair.
df = transactions_df.join(
    articles_df,
    on='article_id',
    how='inner',
)

In [18]:
# Add the customer attributes to each transaction row.
df = df.join(
    customers_df,
    on='customer_id',
    how='inner',
)

In [19]:
# Convert the `date` column to datetime64.
date_as_datetime = df['date'].astype('datetime64')
df['date'] = date_as_datetime

# Remove the `postal_code` column from the main frame.
df = df.drop(columns=['postal_code'])

# Exploratory Data Analysis

In [20]:
# Report the (rows, columns) shape of each source table and of the joined frame.
for table_label, table in (
    ('transactions', transactions_df),
    ('customers', customers_df),
    ('articles', articles_df),
    ('main', df),
):
    print(table_label, table.shape)

transactions (31788324, 5)
customers (1371980, 7)
articles (105542, 14)
main (31788324, 23)
