In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import re
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8' and later
# releases dropped the old 'seaborn' alias, so pick whichever name this install has.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [4]:
# Load the scraped records. The encoding is stated explicitly because open()
# otherwise falls back to the platform default, which would corrupt the
# non-ASCII (Lithuanian) text on systems where that default is not UTF-8.
with open('scraped_data_utf8.json', encoding='utf-8') as file:
    data = json.load(file)

In [6]:
def make_clickable(val):
    """Return an HTML anchor that shows ``val`` and links to it in a new tab.

    The value comes from scraped data, so it is HTML-escaped before being
    placed in the ``href`` attribute and the link text; this keeps quotes or
    angle brackets in a URL from breaking (or injecting into) the markup.

    Parameters
    ----------
    val : object
        The URL to link to; converted with ``str`` before escaping.

    Returns
    -------
    str
        ``<a target="_blank" href="...">...</a>`` markup.
    """
    from html import escape  # local import so the cell does not rely on the imports cell

    safe = escape(str(val), quote=True)
    # target="_blank" makes the browser open the link in a new window/tab
    return '<a target="_blank" href="{}">{}</a>'.format(safe, safe)

# Build the frame with one row per top-level key of the loaded JSON.
df = pd.DataFrame.from_dict(data, orient='index')
print(df.shape)
# `Styler.format` returns a Styler and leaves `df` untouched, so the styled
# preview has to be the cell's final expression for the clickable `url`
# links to be rendered at all.
df.head().style.format({'url': make_clickable})

(640, 8)


Unnamed: 0,title,category,price,location,users,rating,votes,url
0,Dronų pilotavimo pamoka,Oro pramogos,29.0,Vilnius,12,,,https://www.geradovana.lt/dronu-pilotavimo-pam...
1,Dovanų kortelė | GERA DOVANA,Oro pramogos,,"Vilnius(aps.),Kaunas(aps.),Klaipėda(aps.),Pala...","Neribojama,1,2,3+",4.5,2.0,https://www.geradovana.lt/dovanu-kortele--gera...
2,Dovanų rinkinys ATITRŪK,Oro pramogos,199.0,"Vilnius,Kaunas(aps.),Klaipėda(aps.),Palanga,Dr...",12,4.6,7.0,https://www.geradovana.lt/dovanu-rinkinys-atit...
3,Vakarienės prenumerata,Pramogos namuose,,"Vilnius,Kaunas",Neribojama,5.0,2.0,https://www.geradovana.lt/vakarienes-prenumera...
4,Žurnalo LAIMĖ prenumerata,Pramogos namuose,14.99,VisaLietuva,1,,,https://www.geradovana.lt/zurnalo-laima-prenum...


Dataset has 583 observations and 8 columns.  

Description of columns:  
`title` : title of coupon  
`category` : to which category the item belongs  
`price` : what is the price for coupon? None indicates that one could choose coupon value  
`location`: where one can get the service  
`users`: how many participants  
`rating`: rating from 0 to 5  
`votes`: how many votes were used to calculate the rating  
`url`: URL of the coupon, in case more details are needed

Most columns contain plain text, so the main objective is to extract valuable information from them. The key column, however, is `price`.


In [39]:
# Print each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 583 entries, 0 to 582
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   title     583 non-null    object 
 1   category  583 non-null    object 
 2   price     522 non-null    float64
 3   location  581 non-null    object 
 4   users     549 non-null    object 
 5   rating    234 non-null    object 
 6   votes     234 non-null    object 
 7   url       583 non-null    object 
dtypes: float64(1), object(7)
memory usage: 61.0+ KB


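The `price` column contains 61 missing values (522 of 583 entries are non-null). `df.info()` above already reports it as `float64`, but the raw values may arrive as strings (object), so I will run a cleaning step that coerces the column to a float type with missing values represented as NaN.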

In [69]:
# Work on an independent (deep) copy so the cleaning steps below never
# modify the original dataframe.
clean_df = df.copy(deep=True)

In [70]:
# Coerce `price` to a numeric dtype; anything that cannot be parsed becomes NaN.
clean_df['price'] = pd.to_numeric(clean_df['price'], errors='coerce')
# Sanity check: summary statistics of the converted column.
clean_df['price'].describe()

count     522.000000
mean       70.828487
std       163.790986
min         4.190000
25%        23.340000
50%        39.000000
75%        74.750000
max      2690.000000
Name: price, dtype: float64

In [71]:
# Report how many distinct categories there are, then list them.
print("Unique values: ", clean_df['category'].nunique())
clean_df['category'].unique()

Unique values:  13


array(['Tamsa', 'Poilsis su nakvyne', 'Vandens pramogos',
       'Superautomobiliai', 'TOP aktyvus laisvalaikis', 'Grožis',
       'Geras skonis', 'Spa ir masažai', 'Sveikatinimas',
       'Kūrybiškos, linksmos', 'Žemės pramogos', 'Pramogos namuose',
       'Oro pramogos'], dtype=object)

Category column contains 13 unique values. I will not perform cleaning on it

In [72]:
# Turn the comma-separated `location` field into one row per (coupon, location):
#   1. split `location` into one column per listed place,
#   2. join those columns back onto the remaining fields by index,
#   3. un-pivot them into rows (melt keeps the split position in its default
#      `variable` column),
#   4. drop the empty slots left by coupons that list fewer places.
clean_data = (
    pd.merge(
        clean_df.location.str.split(",", expand=True),
        clean_df,
        left_index=True,
        right_index=True,
    )
    .drop(columns='location')
    .melt(
        id_vars=['title', 'category', 'price', 'users', 'rating', 'votes', 'url'],
        value_name="location",
    )
    .dropna(subset=['location'])
)
clean_data.shape

(939, 9)

The `location` column can contain multiple values separated by commas. I have split them and created a new row for each value. Because of that, my data has 939 observations instead of 583.

In [67]:
# Embed an external web page inside the notebook output as an 800x400 iframe.
# NOTE(review): this cell looks unrelated to the coupon analysis, and the import
# sits mid-notebook instead of in the imports cell — confirm whether it is
# still needed.
from IPython.display import IFrame

url = 'https://www.wikipedia.org'
IFrame(url, width=800, height=400)

(939, 9)