# Reading the Raw data to DF

In [6]:
import pandas as pd
import gzip
import json

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('C:/Users/chunshen/Downloads/meta_Electronics.json.gz')

### Slicing the relevant data (Title and Description)

In [8]:
df.columns

Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')

In [15]:
new_df = df[['title','description']]


In [24]:
new_df.to_csv('dataframe_slice.csv',encoding='utf8')

### Cleaning the Data

In [28]:
data = pd.read_csv('dataframe_slice.csv',index_col=0)

In [29]:
data

Unnamed: 0,title,description
0,Genuine Geovision 1 Channel 3rd Party NVR IP S...,['The following camera brands and models have ...
1,"Books ""Handbook of Astronomical Image Processi...","[""This second edition of the Handbook of Astro..."
2,One Hot Summer,['A zesty tale. (Publishers Weekly)<br /><br /...
3,Hurray for Hattie Rabbit: Story and pictures (...,[]
4,sex.lies.murder.fame.: A Novel,['&#8220;sex.lies.murder.fame. is brillllli&#8...
...,...,...
786440,Tukzer Cassette Player Portable Tape Player Ca...,['<b>Specification: </b><br>Item Type: Cassett...
786441,Cobiter AC Adapter Battery Charger For IBM LEN...,"[""<br />Cobiter AC adapters are made with the ..."
786442,Baofeng Original Li-ion Battery Charger Deskto...,['</b>Specifications:<br><br> Original Li-ion ...
786443,Jricoo 3pack 6ft Micro USB to USB Cable 2.0 6f...,[]


In [33]:
index_to_remove=[]
for idx,title,desc in data.itertuples():
    if desc =='[]':
        index_to_remove.append(idx)
index_to_remove    

[3,
 9,
 14,
 33,
 43,
 50,
 58,
 59,
 60,
 62,
 75,
 99,
 105,
 109,
 110,
 119,
 127,
 150,
 157,
 176,
 177,
 200,
 211,
 404,
 448,
 451,
 462,
 474,
 493,
 519,
 558,
 580,
 604,
 616,
 631,
 652,
 655,
 670,
 679,
 680,
 686,
 690,
 696,
 698,
 701,
 708,
 709,
 710,
 712,
 713,
 745,
 749,
 751,
 758,
 762,
 764,
 787,
 815,
 843,
 851,
 852,
 855,
 861,
 862,
 864,
 866,
 875,
 876,
 887,
 890,
 891,
 892,
 893,
 894,
 895,
 896,
 897,
 913,
 914,
 915,
 917,
 918,
 919,
 923,
 925,
 928,
 938,
 940,
 941,
 942,
 943,
 945,
 946,
 948,
 955,
 963,
 964,
 971,
 975,
 978,
 990,
 993,
 995,
 1000,
 1002,
 1004,
 1005,
 1148,
 1151,
 1264,
 1267,
 1286,
 1321,
 1342,
 1355,
 1377,
 1403,
 1414,
 1418,
 1431,
 1458,
 1682,
 1833,
 1849,
 1855,
 1874,
 1914,
 1939,
 1946,
 1969,
 1982,
 1987,
 1994,
 2007,
 2012,
 2015,
 2016,
 2059,
 2068,
 2103,
 2208,
 2272,
 2293,
 2339,
 2341,
 2367,
 2391,
 2398,
 2425,
 2436,
 2441,
 2465,
 2478,
 2511,
 2519,
 2520,
 2523,
 2527,
 2531,
 253

In [34]:
data.drop(index=index_to_remove,inplace=True)

In [43]:
def length(x):
    try:
        return len(x)
    except:
        return 0

data['title_len']=data['title'].apply(length)

In [50]:
data['desc_len']=data['description'].apply(lambda x:len(x))

In [56]:
data_clean = data[(data['title_len']>=5) & (data['desc_len']>=10)]

In [59]:
data_clean.to_csv('dataframe_slice.csv')

#### Additional Cleaning

In [1]:
import pandas as pd
data = pd.read_csv('dataframe_slice.csv',engine='python',error_bad_lines=False,index_col=0)
data

Skipping line 79790: field larger than field limit (131072)
Skipping line 163735: unexpected end of data


Unnamed: 0,title,description,title_len,desc_len
0,Genuine Geovision 1 Channel 3rd Party NVR IP S...,['The following camera brands and models have ...,80,793
1,"Books ""Handbook of Astronomical Image Processi...","[""This second edition of the Handbook of Astro...",113,1471
2,One Hot Summer,['A zesty tale. (Publishers Weekly)<br /><br /...,14,628
4,sex.lies.murder.fame.: A Novel,['&#8220;sex.lies.murder.fame. is brillllli&#8...,30,972
6,Girl with a One-track Mind: Confessions of the...,['GIRL WITH A ONE-TRACK MIND: CONFESSIONS OF T...,67,71
...,...,...,...,...
180615,ASRock Atom Dual-Core 330/DVI&amp;HDMI/A&amp;V...,['ASRock A330ION Atom Dual-Core 330/DVI&HDMI/A...,83,75
180616,DURAGADGET comfortable in ear headphones for S...,['High-quality audio from a comfortable in-ear...,72,357
180617,Sony VAIO Keyboard Skin,['Add a splash of personality to the inside of...,23,396
180618,CyberpowerPC Gamer LiquidCool U101 Desktop PC ...,"['Accomplish all your computing tasks easier, ...",134,730


In [2]:
#This function is to sample the dataframe to check any anomalies that might cause the data to be inaccurate
def sample(df):
    new_df = df.sample(frac=0.00005)
    for idx,title,desc,t_len,d_len in new_df.itertuples():
        print(f"title is :{title}")
        print('\n')
        print(f"description is :{desc}")
    return 'end'
        

In [14]:
data['description'] = data['description'].apply(clean)

In [12]:
# Re moves basic punctuation that adds no value to the description
def clean(x):
    to_remove = ['\'','\"','[',']']
    for i in to_remove:
        x=x.replace(i,'')
    return str(x)

clean(data.iloc[1]['description'])

'This second edition of the Handbook of Astronomical Image Processing (HAIP) and its integral AIP for Windows 2.0 image processing software (AIP4Win2.0) addresses many important changes that have taken place in astronomical imaging since the publication of the first edition.  Todays affordable astro-imaging capable digital single-lens-reflex cameras (DSLRs), the growing power of personal computers, and the proliferation of telescopes and imaging accessories has brought imaging capabilities within the reach of practically every amateur astronomer - and this second edition of the Handbook plus AIP4Win 2.0 is ready, willing, and able to assist every observer in making great astronomical images.  In the Handbook, we amplified the original chapters on astronomical equipment and imaging techniques, revised our discussions of astrometry and photometry to reflect the steady growth in these scientific fields, and expanded tutorials in the back of the book to help you get up to speed quickly.  O

In [33]:
# This function removes cases of < blah blah bladqd > encapsulated by "<" and ">".
def remove_extra_punc(x):
    x_new = ''
    if '<' in x:
        stop = False
        for i in x:
            if i == '<':
                stop = True
            if i == '>':
                stop = False
                continue
            
            if stop == False:
                x_new += i
        return x_new
    else:
        return x

data['description'] = data['description'].apply(remove_extra_punc)

### Counting length based on ' ' spacing

In [34]:
data = data[['title','description']]
data['desc_len']=data['description'].apply(lambda x: len(x.split()))  
data['title_len']=data['title'].apply(lambda x: len(x.split()))  
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['desc_len']=data['description'].apply(lambda x: len(x.split()))


Unnamed: 0,title,description,desc_len,title_len
0,Genuine Geovision 1 Channel 3rd Party NVR IP S...,The following camera brands and models have be...,107,14
1,"Books ""Handbook of Astronomical Image Processi...",This second edition of the Handbook of Astrono...,217,17
2,One Hot Summer,A zesty tale. (Publishers Weekly)Garcia Aguile...,89,3
4,sex.lies.murder.fame.: A Novel,&#8220;sex.lies.murder.fame. is brillllli&#821...,124,3
6,Girl with a One-track Mind: Confessions of the...,GIRL WITH A ONE-TRACK MIND: CONFESSIONS OF THE...,11,11
...,...,...,...,...
180615,ASRock Atom Dual-Core 330/DVI&amp;HDMI/A&amp;V...,ASRock A330ION Atom Dual-Core 330/DVI&HDMI/A&V...,6,6
180616,DURAGADGET comfortable in ear headphones for S...,High-quality audio from a comfortable in-ear h...,51,11
180617,Sony VAIO Keyboard Skin,Add a splash of personality to the inside of y...,64,4
180618,CyberpowerPC Gamer LiquidCool U101 Desktop PC ...,"Accomplish all your computing tasks easier, fa...",39,21


In [35]:
desc_mean = data['desc_len'].median()
title_mean = data['title_len'].median()
print(f'The median title length is {title_mean} and the median description length is {desc_mean}.')

The median title length is 9.0 and the median description length is 67.0.


In [41]:
#Run this cell to sample 10 titles and description
sample(data)

title is : Wired Mouse - 300 C Black


description is :Its not often that a computer accessory puts a smile on the face of each person who sees it. Road Mice, the officially-licensed computer car mouse, appeals to computer users who love cars. Car enthusiasts, young and old, love these replica automobiles that work as fully-functional computer mice. Road Mice have smooth lines, realistic details, glossy finish and working LED headlights! The hood is split in two segments with a scroll wheel in the middle, so that users can navigate each right and left click with ease. Each mouse comes with a unique VIN number, allowing owners to register their car, receive a title and activate the 12 month product warranty. PC and Mac compatible via available USB port. Four Door Media, the makers of Road Mice, creates, manufacturers, markets and distributes computer-based products and accessories that entertain, promote and educate. The company focuses on marketing products that relate to the automotiv

'end'

In [43]:
#Saving the cleaned data
data.to_csv('data_cleaned.csv')