In [2]:
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout

In [None]:
# Load the dataset from tops_fashion.json (path is relative to the working directory).
data = pd.read_json('tops_fashion.json')

In [None]:
# Report the size of the loaded frame: rows are data points, columns are features.
n_rows, n_cols = data.shape
print('Number of data points : ', n_rows,
      'Number of features/variables:', n_cols)

In [None]:
# Display the names of all columns in the loaded frame.
data.columns

### We keep seven of the 19 features in the dataset

1. asin (Amazon Standard Identification Number)
2. brand (brand to which the product belongs)
3. color (color information of the apparel; a value can contain several colors, e.g. "red and black stripes")
4. product_type_name (type of the apparel, e.g. SHIRT/TSHIRT)
5. medium_image_url (URL of the image)
6. title (title of the product)
7. formatted_price (price of the product)

In [None]:
# Restrict the frame to the seven selected columns, in this order.
selected_columns = [
    'asin',
    'brand',
    'color',
    'medium_image_url',
    'product_type_name',
    'title',
    'formatted_price',
]
data = data[selected_columns]

In [None]:
# Report the size of the frame after column selection, then preview the first rows.
n_rows, n_cols = data.shape
print('Number of data points : ', n_rows,
      'Number of features:', n_cols)
data.head()


### 1 Data Cleaning
### 1.0 missing data 

1.1)product_type_name

In [None]:
# Print pandas' describe() summary of the product_type_name column.
print(data['product_type_name'].describe())

In [None]:
# Print the distinct values present in the product_type_name column.
print(data['product_type_name'].unique())

In [None]:
# Tally how often each product type occurs and show the ten most frequent.
product_type_count = Counter(data['product_type_name'].tolist())
product_type_count.most_common(10)

1.2)brand

In [None]:
# Print pandas' describe() summary of the brand column.
print(data['brand'].describe())

In [None]:
# Print the distinct values present in the brand column.
print(data['brand'].unique())

In [None]:
# Tally how often each brand occurs and show the ten most frequent.
brand_count = Counter(data['brand'].tolist())
brand_count.most_common(10)

1.3)color

In [None]:
# Print pandas' describe() summary of the color column.
print(data['color'].describe())

In [None]:
# Print the distinct values present in the color column.
print(data['color'].unique())

In [None]:
# Tally how often each color value occurs and show the ten most frequent.
color_count = Counter(data['color'].tolist())
color_count.most_common(10)

1.4)formatted_price

In [None]:
# Print pandas' describe() summary of the formatted_price column.
print(data['formatted_price'].describe())

In [None]:
# Tally how often each formatted price occurs and show the ten most frequent.
formatted_price_count = Counter(data['formatted_price'].tolist())
formatted_price_count.most_common(10)

1.5)title

In [None]:
# Print pandas' describe() summary of the title column.
print(data['title'].describe())

In [None]:
# Tally how often each title occurs and show the ten most frequent.
title_count = Counter(data['title'].tolist())
title_count.most_common(10)

### 1.1 reducing data 

In [None]:
# Checkpoint the seven-column frame before any rows are dropped.
# to_pickle does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
os.makedirs('pickels', exist_ok=True)
data.to_pickle('pickels/180k_apparel_data')

In [None]:
# Keep only the rows that carry a price.
has_price = data['formatted_price'].notnull()
data = data.loc[has_price]
print('Number of data points After eliminating price=NULL :', len(data))

In [None]:
# Keep only the rows that carry color information.
has_color = data['color'].notnull()
data = data.loc[has_color]
print('Number of data points After eliminating color=NULL :', len(data))

In [None]:
# Checkpoint the frame after rows with a missing price or color were removed.
data.to_pickle('pickels/28k_apparel_data')

### 1.2 remove duplicates 

In [None]:
# Reload the checkpoint and count the rows whose title repeats an earlier row's title.
data = pd.read_pickle('pickels/28k_apparel_data')
n_duplicate_titles = data.duplicated('title').sum()
print(n_duplicate_titles)

In [None]:
# Preview the first rows of the reloaded frame.
data.head()

In [None]:
# Keep only products whose title has more than four whitespace-separated words.
has_long_title = data['title'].map(lambda title: len(title.split()) > 4)
data_sorted = data[has_long_title]
print("After removal of products with short description:", data_sorted.shape[0])

In [None]:
# Order the rows by title, descending, so that titles sharing a prefix end up
# next to each other. Rebinding the name (instead of inplace=True) avoids
# mutating a frame that was produced by boolean filtering, which can trigger
# pandas' SettingWithCopyWarning.
data_sorted = data_sorted.sort_values('title', ascending=False)
data_sorted.head()

In [None]:
# Index labels of data_sorted in its current row order.
# Reading them straight from the index avoids iterrows(), which builds a
# Series for every row only to throw it away.
indices = list(data_sorted.index)


In [None]:
import itertools

# Stage 1 de-duplication over the title-sorted frame.
#
# Walk the rows in sorted order. The current row `i` is kept, then every
# following row whose title differs from it in at most two word positions is
# skipped as a near-duplicate. The first row that differs in more than two
# positions becomes the next kept row.
#
# The threshold test runs once per title pair (after all words were compared)
# and the scan stops at the first clearly different title, so the loop always
# advances and each kept asin is recorded exactly once, including the last one.
stage1_dedupe_asins = []
sorted_title_words = [title.split() for title in data_sorted['title']]
sorted_asins = data_sorted['asin'].tolist()
num_data_points = data_sorted.shape[0]

i = 0
while i < num_data_points:
    stage1_dedupe_asins.append(sorted_asins[i])
    a = sorted_title_words[i]

    j = i + 1
    while j < num_data_points:
        b = sorted_title_words[j]
        # zip_longest pads the shorter title with None, so missing words
        # count as mismatches.
        length = max(len(a), len(b))
        count = sum(1 for word_a, word_b in itertools.zip_longest(a, b)
                    if word_a == word_b)
        if (length - count) > 2:
            # Row j is a different product: it starts the next group.
            break
        j += 1

    i = j
            
                    
            

In [None]:
# Keep only the rows whose asin survived stage one of de-duplication.
kept_after_stage1 = data['asin'].isin(stage1_dedupe_asins)
data = data.loc[kept_after_stage1]

In [None]:
# Report how many rows remain after stage one, then checkpoint them.
print('Number of data points : ', len(data))
data.to_pickle('pickels/17k_apperal_data')

In [None]:
# Reload the stage-one checkpoint written by this notebook.
# NOTE: unpickling can execute arbitrary code, so only load files you produced yourself.
data = pd.read_pickle('pickels/17k_apperal_data')

In [None]:
# Index labels still waiting to be examined by stage two, in row order.
# Reading them from the index avoids building a Series per row with iterrows().
indices = list(data.index)

# asins kept by stage two; filled by the next cell.
stage2_dedupe_asins = []

In [None]:
# Stage 2 de-duplication: compare every kept title against ALL remaining titles.
#
# Repeatedly take the last pending row, keep its asin, and discard every other
# pending row whose title differs from it in fewer than three word positions.
#
# Survivors are collected into a fresh list rather than removed from `indices`
# during iteration: removing while iterating makes the loop skip the element
# that follows each removal, so some near-duplicates would never be compared.
# Titles are tokenised once up front instead of once per comparison.
title_words = {idx: title.split() for idx, title in data['title'].items()}

while indices:
    i = indices.pop()
    stage2_dedupe_asins.append(data['asin'].loc[i])
    a = title_words[i]

    remaining = []
    for j in indices:
        b = title_words[j]
        # zip_longest pads the shorter title with None, so missing words
        # count as mismatches.
        length = max(len(a), len(b))
        count = sum(1 for word_a, word_b in itertools.zip_longest(a, b)
                    if word_a == word_b)
        if (length - count) >= 3:
            # Different enough from row i: keep it for a later round.
            remaining.append(j)
    indices = remaining

In [None]:
# Keep only the rows whose asin survived stage two of de-duplication.
kept_after_stage2 = data['asin'].isin(stage2_dedupe_asins)
data = data.loc[kept_after_stage2]
print('Number of data points after stage two of dedupe: ', len(data))

In [None]:
# Checkpoint the frame after both de-duplication stages.
data.to_pickle('pickels/16k_apperal_data')

### 2 Text pre-processing

In [3]:
# Reload the de-duplicated checkpoint as the starting point for text pre-processing.
data = pd.read_pickle('pickels/16k_apperal_data')