In [44]:
'''
Webscraping using Python
This scrapes Forever21's website to access the category name, display name, whether
it is a final sale, the product id, and the list price of some dresses, tops, and bottoms.
Then, it takes this information and converts it to a csv using pandas.

Then, we experiment with machine learning packages (such as sklearn) to build a classifier
that predicts which category (dress, top, or bottom) an item belongs to,
based on whether it's a final sale, its product ID, and its list price.

rittika2
sohams2
'''

import requests
from bs4 import BeautifulSoup
import json
import re

# returns the json containing information about clothes in seed url
def get_clothes_json(seed_url, timeout=30):
    """Download a category page and return the product JSON text embedded in it.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. Among
    all ``<script>`` tags, the last one whose text contains
    ``'Loading Category'`` is selected. The text between that script's first
    ``{`` and its last ``}`` is re-wrapped in braces and then cut at the first
    ``$`` character.

    Parameters
    ----------
    seed_url : str
        URL of the category page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without
        a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    str
        The extracted text. It is returned as a raw string and is not parsed
        or validated as JSON here.

    Raises
    ------
    ValueError
        If no ``<script>`` tag containing ``'Loading Category'`` is found, or
        the matching script contains no ``{``.
    """
    # gets the html document for the seed url
    html_doc = requests.get(seed_url, timeout=timeout).text
    soup = BeautifulSoup(html_doc, "html5lib")

    # finds the "correct" script tag with the json about the clothes;
    # when several tags match, the last one wins
    script_text = None
    for script_tag in soup.find_all('script'):
        # .string is None for tags that do not hold a single text node
        if script_tag.string is not None and 'Loading Category' in script_tag.string:
            script_text = script_tag.string

    if script_text is None or '{' not in script_text:
        raise ValueError(
            "no <script> tag containing 'Loading Category' and a JSON object "
            "was found at %s" % (seed_url,))

    # gets the json inside the script tag: everything between the first '{'
    # and the last '}', wrapped in braces again
    inner = script_text.split('{', 1)[1].rsplit('}', 1)[0]
    wrapped = '{%s}' % (inner,)

    # anything from the first '$' onwards is dropped
    return re.split('[$]', wrapped)[0]

# The seed urls are generated by a helper instead of being written out by hand.
F21_CATEGORY_URL = 'https://www.forever21.com/us/shop/Catalog/Category/f21/'
F21_PAGE_SUFFIX = '#pageno=%d&pageSize=120&filter=price:0,250'


def category_page_urls(category, num_pages=5):
    """Return the seed urls for the first ``num_pages`` pages of ``category``.

    The first url is the bare category url; each following page appends
    ``F21_PAGE_SUFFIX`` with its page number (2, 3, ...).
    """
    base_url = F21_CATEGORY_URL + category
    return [base_url] + [base_url + F21_PAGE_SUFFIX % page_no
                         for page_no in range(2, num_pages + 1)]


# NOTE(review): everything after '#' in a URL is a fragment, which HTTP clients
# do not send to the server. Pages 2-5 of each category therefore probably
# return the same document as page 1, which would duplicate rows downstream.
# TODO confirm against the site and switch to real query parameters if so.

# get jsons for dresses
(dress_1_json, dress_2_json, dress_3_json,
 dress_4_json, dress_5_json) = [get_clothes_json(url) for url in category_page_urls('dress')]

# get jsons for tops
(tops_1_json, tops_2_json, tops_3_json,
 tops_4_json, tops_5_json) = [get_clothes_json(url) for url in category_page_urls('top_blouses')]

# get jsons for bottoms
(bottoms_1_json, bottoms_2_json, bottoms_3_json,
 bottoms_4_json, bottoms_5_json) = [get_clothes_json(url) for url in category_page_urls('bottoms')]

In [45]:
import pandas as pd

# converts a given json file to a pandas dataframe
def convert_to_df(json_file):
    """Build a dataframe of product attributes from the scraped JSON text.

    The text is not parsed as JSON. It is split on every ``:`` and ``,`` and
    the resulting tokens are scanned for the keys ``"CategoryName"``,
    ``"DisplayName"``, ``"FinalSale"``, ``"ProductId"`` and ``"ListPrice"``;
    the token following a key is taken as its value. Because of the split,
    a value that itself contains ``:`` or ``,`` is only captured up to that
    character.

    Parameters
    ----------
    json_file : str
        The JSON text, e.g. as returned by ``get_clothes_json``.

    Returns
    -------
    pandas.DataFrame
        One row per collected category name, with string-valued columns
        ``Category Name``, ``Display Name``, ``Final Sale``, ``Product ID``
        and ``List Price`` (surrounding double quotes stripped). Input that
        yields no rows gives an empty dataframe with those same columns.

    Raises
    ------
    IndexError
        If fewer display names, final-sale flags, product ids or list prices
        than category names were collected.
    """
    columns = ["Category Name", "Display Name", "Final Sale", "Product ID", "List Price"]

    # splits the json file to help parse the json string
    tokens = re.split("[:,]", json_file)

    # lists of the attributes to be tracked
    category_names = []
    display_names = []
    final_sales = []
    product_id = []
    list_prices = []

    # keys whose value is stored whenever it is not the empty string ""
    plain_keys = {
        '"CategoryName"': category_names,
        '"DisplayName"': display_names,
        '"FinalSale"': final_sales,
        '"ProductId"': product_id,
    }

    # iterates through the tokens to find the attributes
    for idx, token in enumerate(tokens):
        if idx + 1 >= len(tokens):
            # the final token has nothing after it, so it cannot be a key
            # with a value
            break
        value = tokens[idx + 1]

        if token in plain_keys:
            if value != '""':
                plain_keys[token].append(value)
        elif token == '"ListPrice"':
            # a hacky fix to avoid double/triple counting the list price:
            # only the occurrence directly preceded by a token containing
            # "AM" or "PM" is kept. idx > 0 stops index -1 from wrapping
            # around to the last token.
            if idx > 0 and ("AM" in tokens[idx - 1] or "PM" in tokens[idx - 1]):
                list_prices.append(value)

    # a hacky fix to even out the # of categories: the first category name
    # found is discarded (presumably a page-level entry rather than a
    # product -- TODO confirm). Slicing is a no-op when nothing was found.
    category_names = category_names[1:]

    # creating proper generic labels for categories: if any entry matches a
    # known raw label, every entry is replaced by the generic label
    # (checked in this order; the first match wins)
    for raw_label, generic_label in (('"bottoms"', '"bottoms"'),
                                     ('"top_blouses"', '"tops"'),
                                     ('"dress"', '"dress"')):
        if raw_label in category_names:
            category_names = [generic_label] * len(category_names)
            break

    # data preprocessing: creating rows from the lists created above
    rows = [
        [category_names[i].strip('"'), display_names[i].strip('"'),
         final_sales[i].strip('"'), product_id[i].strip('"'),
         list_prices[i].strip('"')]
        for i in range(len(category_names))
    ]

    # passing the headers to the constructor keeps them even when rows is empty
    return pd.DataFrame(rows, columns=columns)

# dress pages -> pandas dataframes
(df_dress_1, df_dress_2, df_dress_3,
 df_dress_4, df_dress_5) = map(convert_to_df, (dress_1_json, dress_2_json, dress_3_json,
                                               dress_4_json, dress_5_json))

# tops pages -> pandas dataframes
(df_tops_1, df_tops_2, df_tops_3,
 df_tops_4, df_tops_5) = map(convert_to_df, (tops_1_json, tops_2_json, tops_3_json,
                                             tops_4_json, tops_5_json))

# bottoms pages -> pandas dataframes
(df_bottoms_1, df_bottoms_2, df_bottoms_3,
 df_bottoms_4, df_bottoms_5) = map(convert_to_df, (bottoms_1_json, bottoms_2_json, bottoms_3_json,
                                                   bottoms_4_json, bottoms_5_json))

# every dataframe in one list: dresses first, then tops, then bottoms
df_list = [
    df_dress_1, df_dress_2, df_dress_3, df_dress_4, df_dress_5,
    df_tops_1, df_tops_2, df_tops_3, df_tops_4, df_tops_5,
    df_bottoms_1, df_bottoms_2, df_bottoms_3, df_bottoms_4, df_bottoms_5,
]

In [46]:
# converts a list of pandas dataframes to a csv
# and writes out the file to a given outfile
def convert_to_csv(df_list, outfile):
    """Concatenate the dataframes in ``df_list`` and write them to ``outfile``.

    The frames are stacked in list order and written as CSV without the
    index column. Nothing is returned.
    """
    pd.concat(df_list).to_csv(outfile, index=False)

# writes every scraped dataframe to a single csv file named 'out'
# (a relative path, so it lands in the current working directory)
convert_to_csv(df_list, 'out')

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


# This section uses a RandomForestClassifier (from sklearn) to "classify" the test data
# (a held-out 30% split of the rows read from 'out') into top, bottoms, and dress.
# Then, it prints the RandomForestClassifier's accuracy on the training data and test data,
# and prints out a confusion matrix for each.

# fixed seed so the train/test split and the forest are reproducible across runs
RANDOM_SEED = 42

# data pre-processing
# The first line of 'out' is the header row, so read_csv is left to consume it
# as the header. Reading with header=None and promoting row 0 by hand would turn
# every column (including Product ID and List Price) into strings.
df = pd.read_csv(
    filepath_or_buffer='out',
    sep=',')

# Display Name is not used as a feature
df = df.drop('Display Name', axis=1)
headers = list(df.columns)  # [Category Name, Final Sale, Product ID, List Price]


# preprocessing categorical data
from sklearn.preprocessing import LabelEncoder
# one encoder per column, so `enc` keeps the category <-> integer mapping
# (enc.classes_ / enc.inverse_transform) after the Final Sale column is encoded
enc = LabelEncoder()
final_sale_enc = LabelEncoder()
df[headers[0]] = enc.fit_transform(df[headers[0]])
df[headers[1]] = final_sale_enc.fit_transform(df[headers[1]])

# NOTE(review): Product ID is used as a feature and the scrape may contain
# duplicated rows; either could inflate the test accuracy -- TODO confirm.

# splitting training data
train_x, test_x, train_y, test_y = train_test_split(df[headers[1:]], df[headers[0]],
                                                    train_size=0.70, test_size=0.30,
                                                    random_state=RANDOM_SEED)

# training random forest classifier
# (n_estimators=1 is kept from the original experiment: a single-tree forest)
clf = RandomForestClassifier(n_estimators=1, random_state=RANDOM_SEED)
clf.fit(train_x, train_y)

# predict once per split and reuse the predictions for every metric
train_pred = clf.predict(train_x)
test_pred = clf.predict(test_x)

# computing accuracies and confusion matrix
# (accuracy_score returns the fraction of correct predictions by default)
print("Train Accuracy: ", accuracy_score(train_y, train_pred))
print("Test Accuracy: ", accuracy_score(test_y, test_pred))
print("Confusion Matrix on Training Data:\n", confusion_matrix(train_y, train_pred))
print("Confusion Matrix on Test Data:\n", confusion_matrix(test_y, test_pred))

Train Accuracy:  0.938095238095
Test Accuracy:  0.885185185185
Confusion Matrix on Training Data:
 [[400  13  17]
 [ 16 391  14]
 [  5  13 391]]
Confusion Matrix on Test Data:
 [[145  12  13]
 [  9 154  16]
 [  5   7 179]]
