# Web scraping shoe data from the Jumia web page

---

In [None]:
#import requests and beautifulsoup
from bs4 import BeautifulSoup
import requests

In [None]:
#request the Jumia men's shoes listing page and parse it into a soup object
#timeout keeps the cell from hanging forever if the server never answers
page=requests.get('https://www.jumia.ug/mens-shoes/', timeout=30)
#fail loudly on an HTTP error instead of parsing an error page as if it were the listing
page.raise_for_status()
response=page.text
soup=BeautifulSoup(response,'lxml')

In [None]:
#gather every <article> tag matched by the class string 'prd _fb col c-prd'
#then print each one, numbered from 1, so its markup can be inspected
articles=soup.find_all('article',class_='prd _fb col c-prd')
for position, article in enumerate(articles, start=1):
    print(f'article: {position}')
    print(article.prettify())
    print('\n\n')

In [None]:
#scrape every product article into Mens_shoes.csv, one row per article
import csv


def _text_or_none(tag):
    """Return the text of a found tag, or None when the lookup returned None."""
    return tag.text if tag is not None else None


#the with-block closes the file even if an error interrupts the loop
with open('Mens_shoes.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv_writer=csv.writer(csv_file)
    #write header row in csv file (same column order as the data rows below)
    csv_writer.writerow(['Item','old price','new price','discount','rating'])

    #extract data from article tags
    for index, article in enumerate(soup.find_all('article',class_='prd _fb col c-prd')):
        try:
            #get item from article (raises AttributeError when the article has no <h3>)
            item=article.h3.text

            #get new price, old price, discount percentage and rating from article;
            #each one is None when its <div> is absent
            new_price=_text_or_none(article.find('div', class_='prc'))
            old_price=_text_or_none(article.find('div', class_='old'))
            pct_price_cut=_text_or_none(article.find('div', class_='bdg _dsct _sm'))
            stars=_text_or_none(article.find('div', class_='stars _s'))
        #catch all exceptions, print message and skip this article
        except Exception as e:
            print(f"Error: {e}")
            #without skipping, the row below would be written with the previous
            #article's values (or fail with NameError on the very first article)
            continue
        #write new row into csv file using extracted data
        csv_writer.writerow([item, old_price, new_price, pct_price_cut, stars])
        #print the written row
        print(f'{index+1}: {item}\t{old_price}\t{new_price}\t{pct_price_cut}\t{stars}')
    

In [None]:
#import numpy and pandas
import pandas as pd
import numpy as np

In [None]:
#load the scraped csv file into a dataframe
df=pd.read_csv(filepath_or_buffer='Mens_shoes.csv')

In [None]:
#display the first rows of the dataframe (head() with no argument; 5 rows per the original note)
df.head()

In [None]:
#show a brief summary of the dataframe's structure
df.info()

In [None]:
#keep only the leading rating number: one digit at the start of the text,
#optionally followed by '.' and a second digit
#(presumably the scraped text looks like "4.5 out of 5" - confirm against the data)
#raw string so \d and \. reach the regex engine instead of being invalid string escapes
#expand=False returns a Series, matching the single column being assigned
df['rating']=df['rating'].str.extract(r'(^\d\.?\d?)', expand=False)
#rename rating to rating/5
df.rename(columns={'rating':'rating/5'}, inplace=True)
df.head()

In [None]:
#keep only the first run of 1-3 digits of the discount text,
#which drops the % symbol and any other non-digit characters
#raw string so \d reaches the regex engine instead of being an invalid string escape
#expand=False returns a Series, matching the single column being assigned
df['discount']=df['discount'].str.extract(r'(\d{1,3})', expand=False)
df.head()

In [None]:
#split old price into 'old price' (first amount) and 'old price max' (optional second amount)
#an amount is 1-3 digits followed by one or more ',ddd' groups; allowing more than one
#group means values such as 1,250,000 are captured whole instead of being cut to 1,250
#between the two amounts the pattern tolerates optional spaces, '-' and a second 'UGX'
#raw string so \s and \d reach the regex engine instead of being invalid string escapes
price_pattern=r'UGX\s(\d{1,3}(?:,\d{3})+)\s?-?\s?U?G?X?\s?(\d{1,3}(?:,\d{3})+)?'
df[['old price', 'old price max']]=df['old price'].str.extract(price_pattern)
df.head()

In [None]:
#split new price into 'new price' (first amount) and 'new price max' (optional second amount)
#an amount is 1-3 digits followed by one or more ',ddd' groups; allowing more than one
#group means values such as 1,250,000 are captured whole instead of being cut to 1,250
#between the two amounts the pattern tolerates optional spaces, '-' and a second 'UGX'
#raw string so \s and \d reach the regex engine instead of being invalid string escapes
price_pattern=r'UGX\s(\d{1,3}(?:,\d{3})+)\s?-?\s?U?G?X?\s?(\d{1,3}(?:,\d{3})+)?'
df[['new price', 'new price max']]=df['new price'].str.extract(price_pattern)
df.head()

In [None]:
#strip the comma separators out of all four price columns
price_columns=['old price','new price','old price max','new price max']
def _drop_commas(column):
    #replace every ',' in the column's strings with nothing
    return column.str.replace(',' , '')
df[price_columns]=df[price_columns].apply(_drop_commas)
df.head()

In [None]:
#cast every column except 'Item' to 32-bit floats
num_index=df.columns.drop('Item')
print(num_index)
as_float32=df[num_index].astype('float32')
df[num_index]=as_float32
df.head()

In [None]:
#express the discount as a fraction by dividing the percentage by 100
df['discount']=df['discount'].div(100)
df.head()

In [None]:
#extract color from item: capture the run of whitespace, '&', ',' and word characters
#that follows a hyphen and reaches the end of the name, ignoring one optional trailing '.'
#(assumes item names end with "- <color>" - confirm against the scraped data)
#raw string so \s, \w and \. reach the regex engine instead of being invalid string escapes
#expand=False returns a Series, matching the single column being assigned
df['color']=df['Item'].str.extract(r'.+-([\s&,\w]*)\.?$', expand=False)
df.head()

In [None]:
#display the final cleaned dataset
df

In [None]:
#write the cleaned dataframe out to a new csv file
df.to_csv(path_or_buf='Mens_shoes_cleaned.csv')