#    QUESTION 3
####  NAME: RISHAV KUMAR
####  ROLL NO. 2301560042

MY GITHUB LINK: [click me](https://github.com/risav68111/AIML_Assignment)

------------------------------------------------------------------------------------------------------------------------------------------------------------

## Data Validation with Voluptuous (Schema Definitions)


In [None]:
import logging
import pandas as pd
from datetime import datetime
from voluptuous import Schema, Required, Range, All, ALLOW_EXTRA
from voluptuous.error import MultipleInvalid, Invalid


In [None]:
logger = logging.getLogger(0)
logger.setLevel(logging.WARNING)

In [None]:
path =r'C:\Users\risha\Documents\KRMU\AIML_assigment\datasets\sales_data.csv'
sales = pd.read_csv(path)
sales.head()

Unnamed: 0.1,Unnamed: 0,timestamp,city,store_id,sale_number,sale_amount,associate
0,0,2018-09-10 05:00:45,Williamburgh,6,1530,1167.0,Gary Lee
1,1,2018-09-12 10:01:27,Ibarraberg,1,2744,258.0,Daniel Davis
2,2,2018-09-13 12:01:48,Sarachester,2,1908,266.0,Michael Roth
3,3,2018-09-14 20:02:19,Caldwellbury,14,771,-108.0,Michaela Stewart
4,4,2018-09-16 01:03:21,Erikaland,11,1571,-372.0,Mark Taylor


In [None]:
sales=sales.drop(['Unnamed: 0'], axis=1)

In [None]:
sales.dtypes

timestamp       object
city            object
store_id         int64
sale_number      int64
sale_amount    float64
associate       object
dtype: object

In [None]:
sales['timestamp'].map(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))

0     2018-09-10 05:00:45
1     2018-09-12 10:01:27
2     2018-09-13 12:01:48
3     2018-09-14 20:02:19
4     2018-09-16 01:03:21
              ...        
208   2019-09-01 06:46:44
209   2019-09-03 12:47:26
210   2019-09-05 18:47:30
211   2019-09-07 23:48:08
212   2018-09-09 04:48:48
Name: timestamp, Length: 213, dtype: datetime64[ns]

### Data Quality Check

In [None]:
sales.head()

Unnamed: 0,timestamp,city,store_id,sale_number,sale_amount,associate
0,2018-09-10 05:00:45,Williamburgh,6,1530,1167.0,Gary Lee
1,2018-09-12 10:01:27,Ibarraberg,1,2744,258.0,Daniel Davis
2,2018-09-13 12:01:48,Sarachester,2,1908,266.0,Michael Roth
3,2018-09-14 20:02:19,Caldwellbury,14,771,-108.0,Michaela Stewart
4,2018-09-16 01:03:21,Erikaland,11,1571,-372.0,Mark Taylor


In [None]:
sales.dtypes

timestamp       object
city            object
store_id         int64
sale_number      int64
sale_amount    float64
associate       object
dtype: object

## Defining our first schema

In [None]:
schema = Schema({ Required('sale_amount'): All(float, Range(min=2.50, max=1450.99)),}, extra=ALLOW_EXTRA)

In [None]:
error_count = 0
for s_id, sale in sales.T.to_dict().items():
    try:
        schema(sale)
    except MultipleInvalid as e:
        logging.warning('issue with sale: %s (%s) - %s', s_id, sale['sale_amount'], e)
        error_count += 1



In [None]:
error_count

69

In [None]:
sales.shape

(213, 6)

### Questions we might want to answer:
- Do we have an improperly defined schema?
- Are negative values possibly returns or falsely marked? (data entry proceedures)
- Are higher values combined purchases or special sales? (or potentially fraud?)
- What should we do with our schema and our failing data points?

### Adding a custom Validation Case

In [None]:
def ValidDate(fmt='%Y-%m-%d %H:%M:%S'):
    return lambda v: datetime.strptime(v, fmt)

In [None]:
schema = Schema({
    Required('timestamp'): All(ValidDate()),}, extra=ALLOW_EXTRA)

In [None]:
error_count = 0
for s_id, sale in sales.T.to_dict().items():
    try:
        schema(sale)
    except MultipleInvalid as e:
        logging.warning('issue with sale: %s (%s) - %s', s_id, sale['timestamp'], e)
        error_count += 1

In [None]:
error_count

0

## So we have valid date structures, what about actual valid dates?

In [None]:
def ValidDate(fmt='%Y-%m-%d %H:%M:%S'):
    def validation_func(v):
        try:
            assert datetime.strptime(v, fmt) <= datetime.now()
        except AssertionError:
            raise Invalid('date is in the future! %s' % v)
    return validation_func

In [None]:
schema = Schema({
    Required('timestamp'): All(ValidDate()),}, extra=ALLOW_EXTRA)

In [None]:
error_count = 0
for s_id, sale in sales.T.to_dict().items():
    try:
        schema(sale)
    except MultipleInvalid as e:
        logging.warning('issue with sale: %s (%s) - %s', 
                        s_id, sale['timestamp'], e)
        error_count += 1

In [None]:
error_count

0

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv(r'C:\Users\risha\Documents\KRMU\AIML_assigment\datasets\HVAC_with_nulls.csv', encoding='utf-8')

In [None]:
df.head()

Unnamed: 0,Date,Time,TargetTemp,ActualTemp,System,SystemAge,BuildingID,10
0,6/1/13,0:00:01,66.0,58,13,20.0,4,
1,6/2/13,1:00:01,,68,3,20.0,17,
2,6/3/13,2:00:01,70.0,73,17,20.0,18,
3,6/4/13,3:00:01,67.0,63,2,,15,
4,6/5/13,4:00:01,68.0,74,16,9.0,3,


In [None]:
df.isnull().sum()

Date             0
Time             0
TargetTemp     760
ActualTemp       0
System           0
SystemAge      753
BuildingID       0
10            8000
dtype: int64

In [None]:
df.isna()

Unnamed: 0,Date,Time,TargetTemp,ActualTemp,System,SystemAge,BuildingID,10
0,False,False,False,False,False,False,False,True
1,False,False,True,False,False,False,False,True
2,False,False,False,False,False,False,False,True
3,False,False,False,False,False,True,False,True
4,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...
7995,False,False,False,False,False,False,False,True
7996,False,False,False,False,False,False,False,True
7997,False,False,True,False,False,False,False,True
7998,False,False,False,False,False,False,False,True


In [None]:
df.drop_duplicates()

Unnamed: 0,Date,Time,TargetTemp,ActualTemp,System,SystemAge,BuildingID,10
0,6/1/13,0:00:01,66.0,58,13,20.0,4,
1,6/2/13,1:00:01,,68,3,20.0,17,
2,6/3/13,2:00:01,70.0,73,17,20.0,18,
3,6/4/13,3:00:01,67.0,63,2,,15,
4,6/5/13,4:00:01,68.0,74,16,9.0,3,
...,...,...,...,...,...,...,...,...
7995,6/16/13,1:33:07,66.0,58,17,18.0,20,
7996,6/17/13,2:33:07,68.0,72,17,27.0,12,
7997,6/18/13,3:33:07,,69,10,4.0,3,
7998,6/19/13,4:33:07,65.0,63,7,23.0,20,


In [None]:
df=df.drop(['10'], axis=1)

In [None]:
df['TargetTemp'] = df['TargetTemp'].fillna(df['TargetTemp'].mean())


In [None]:
df.TargetTemp.isna().sum()


0

In [None]:
df.ActualTemp.isna().sum()

0

In [None]:
df.SystemAge.isna().sum()

753

In [None]:
df.SystemAge= df.SystemAge.fillna(df.SystemAge.mean())

In [None]:
df.SystemAge.isna().sum()

0

### Managing Nulls 

In [None]:
import pandas as pd
from numpy import random

In [None]:
df = pd.read_csv(r'C:\Users\risha\Documents\KRMU\AIML_assigment\datasets\iot_example_with_nulls.csv')

### Data Quality Check

In [None]:
df.head()

Unnamed: 0,timestamp,username,temperature,heartrate,build,latest,note
0,2017-01-01T12:00:23,michaelsmith,12.0,67,4e6a7805-8faa-2768-6ef6-eb3198b483ac,0.0,interval
1,2017-01-01T12:01:09,kharrison,6.0,78,7256b7b0-e502-f576-62ec-ed73533c9c84,0.0,wake
2,2017-01-01T12:01:34,smithadam,5.0,89,9226c94b-bb4b-a6c8-8e02-cb42b53e9c90,0.0,
3,2017-01-01T12:02:09,eddierodriguez,28.0,76,,0.0,update
4,2017-01-01T12:02:36,kenneth94,29.0,62,122f1c6a-403c-2221-6ed1-b5caa08f11e0,,


In [None]:
df.dtypes

timestamp       object
username        object
temperature    float64
heartrate        int64
build           object
latest         float64
note            object
dtype: object

In [None]:
df.note.value_counts()

note
wake        16496
user        16416
interval    16274
sleep       16226
update      16213
test        16068
Name: count, dtype: int64

### Let's remove all null values (including the note: n/a)

In [None]:
df = pd.read_csv(r'C:\Users\risha\Documents\KRMU\AIML_assigment\datasets\iot_example_with_nulls.csv', na_values=['n/a'])

### Test to see if we can use dropna

In [None]:
df.shape

(146397, 7)

In [None]:
df.dropna().shape

(46116, 7)

In [None]:
df.dropna(how='all', axis=1).shape

(146397, 7)

### Test to see if we can drop columns

In [None]:
my_columns = list(df.columns)

In [None]:
my_columns

['timestamp',
 'username',
 'temperature',
 'heartrate',
 'build',
 'latest',
 'note']

In [None]:
list(df.dropna(thresh=int(df.shape[0] * .9), axis=1).columns)

['timestamp', 'username', 'heartrate']

### I want to find all columns that have missing data

In [None]:
missing_info = list(df.columns[df.isnull().any()])

In [None]:
missing_info

['temperature', 'build', 'latest', 'note']

In [None]:
for col in missing_info:
    num_missing = df[df[col].isnull() == True].shape[0]
    print('number missing for column {}: {}'.format(col, 
                                                    num_missing))

number missing for column temperature: 32357
number missing for column build: 32350
number missing for column latest: 32298
number missing for column note: 48704


In [None]:
for col in missing_info:
    percent_missing = df[df[col].isnull() == True].shape[0] / df.shape[0]
    print('percent missing for column {}: {}'.format(
        col, percent_missing))

percent missing for column temperature: 0.22102228870810195
percent missing for column build: 0.22097447352063226
percent missing for column latest: 0.22061927498514314
percent missing for column note: 0.332684412931959


### Can I easily substitute majority values in for missing data?

In [None]:
df.note.value_counts()

note
wake        16496
user        16416
interval    16274
sleep       16226
update      16213
test        16068
Name: count, dtype: int64

In [None]:
df.build.value_counts().head()

build
4e6a7805-8faa-2768-6ef6-eb3198b483ac    1
12aefc6b-272c-751e-6117-134ee73e2649    1
fd4049c3-2297-14ac-a27e-6da57129dd10    1
0bcfab8f-bc25-3f8f-8585-0614e1555fd1    1
b0de05dd-2860-abbb-8be6-f5c0e30ca063    1
Name: count, dtype: int64

In [None]:
df.latest.value_counts()

latest
0.0    75735
1.0    38364
Name: count, dtype: int64

In [None]:
df.latest = df.latest.fillna(0)

### Have not yet addressed temperature missing values... Let's find a way to fill

In [None]:
df.username.value_counts().head()

username
esmith    45
zsmith    43
vsmith    41
ysmith    40
jsmith    37
Name: count, dtype: int64

In [None]:
df = df.set_index('timestamp')

In [None]:
df.head()

Unnamed: 0_level_0,username,temperature,heartrate,build,latest,note
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-01T12:00:23,michaelsmith,12.0,67,4e6a7805-8faa-2768-6ef6-eb3198b483ac,0.0,interval
2017-01-01T12:01:09,kharrison,6.0,78,7256b7b0-e502-f576-62ec-ed73533c9c84,0.0,wake
2017-01-01T12:01:34,smithadam,5.0,89,9226c94b-bb4b-a6c8-8e02-cb42b53e9c90,0.0,
2017-01-01T12:02:09,eddierodriguez,28.0,76,,0.0,update
2017-01-01T12:02:36,kenneth94,29.0,62,122f1c6a-403c-2221-6ed1-b5caa08f11e0,0.0,


In [None]:
df.temperature = df.groupby(df['username']).temperature.fillna(df.temperature.mean())

### Exercise: How many temperature values did I fill? What percentage of values are still missing (for temperature)?

In [None]:
rows_filled = 32357 - df[df.temperature.isnull() == True].shape[0] 
still_missing = df[df.temperature.isnull() == True].shape[0] / df.shape[0]


In [None]:
rows_filled

32357

In [None]:
still_missing

0.0

In [None]:
from fuzzywuzzy import fuzz, process

In [None]:
berlin = ['Berlin, Germany', 
          'Berlin, Deutschland', 
          'Berlin', 
          'Berlin, DE']

In [None]:
fuzz.partial_ratio(berlin[0], berlin[1])

60

In [None]:
fuzz.ratio?

[1;31mSignature:[0m [0mfuzz[0m[1;33m.[0m[0mratio[0m[1;33m([0m[0ms1[0m[1;33m,[0m [0ms2[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m <no docstring>
[1;31mFile:[0m      c:\users\risha\appdata\local\programs\python\python312\lib\site-packages\fuzzywuzzy\fuzz.py
[1;31mType:[0m      function

In [None]:
fuzz.ratio(berlin[0], berlin[1])

65

In [None]:
fuzz.token_set_ratio(berlin[0], berlin[1])

62

In [None]:
fuzz.token_sort_ratio(berlin[0], berlin[1])

62

In [None]:
fuzz.partial_ratio(berlin[1], berlin[2])

100

In [None]:
fuzz.ratio(berlin[1], berlin[2])

48

In [None]:
fuzz.token_sort_ratio(berlin[1], berlin[2])

50

In [None]:
fuzz.token_set_ratio(berlin[2], berlin[3])

100

In [None]:
choices = ['Germany', 'Deutschland', 'France', 
           'United Kingdom', 'Great Britain', 
           'United States']

In [None]:
process.extract('DE', choices, limit=2)

[('Deutschland', 90), ('United States', 57)]

In [None]:
process.extract('UK', choices)

[('Deutschland', 45),
 ('United Kingdom', 45),
 ('United States', 45),
 ('Germany', 0),
 ('France', 0)]

In [None]:
process.extract('frankreich', choices)

[('France', 62),
 ('Great Britain', 41),
 ('Germany', 35),
 ('United Kingdom', 25),
 ('United States', 25)]

In [None]:
process.extract('U.S.', choices)

[('United States', 86),
 ('Deutschland', 60),
 ('United Kingdom', 57),
 ('Great Britain', 30),
 ('Germany', 0)]

In [None]:
from sklearn import preprocessing
import pandas as pd
from datetime import datetime
from sklearn.impute import SimpleImputer

In [None]:
hvac = pd.read_csv(r"C:\Users\risha\Documents\KRMU\AIML_assigment\datasets\HVAC_with_nulls.csv")

### Checking Data Quality

In [None]:
hvac.dtypes

Date           object
Time           object
TargetTemp    float64
ActualTemp      int64
System          int64
SystemAge     float64
BuildingID      int64
10            float64
dtype: object

In [None]:
hvac.shape

(8000, 8)

In [None]:
hvac= hvac.drop(['10'], axis=1)

In [None]:
hvac.head()

Unnamed: 0,Date,Time,TargetTemp,ActualTemp,System,SystemAge,BuildingID
0,6/1/13,0:00:01,66.0,58,13,20.0,4
1,6/2/13,1:00:01,,68,3,20.0,17
2,6/3/13,2:00:01,70.0,73,17,20.0,18
3,6/4/13,3:00:01,67.0,63,2,,15
4,6/5/13,4:00:01,68.0,74,16,9.0,3


### Impute missing values with mean

In [None]:

# imp = SimpleImputer(missing_values='NaN', strategy='mean')
hvac.TargetTemp= hvac.TargetTemp.fillna(hvac.TargetTemp.mean())

In [None]:
hvac_numeric = hvac[['TargetTemp', 'SystemAge']]

In [None]:
hvac.head()

Unnamed: 0,Date,Time,TargetTemp,ActualTemp,System,SystemAge,BuildingID
0,6/1/13,0:00:01,66.0,58,13,20.0,4
1,6/2/13,1:00:01,67.507735,68,3,20.0,17
2,6/3/13,2:00:01,70.0,73,17,20.0,18
3,6/4/13,3:00:01,67.0,63,2,,15
4,6/5/13,4:00:01,68.0,74,16,9.0,3


### Scale temperature values

In [None]:
hvac['ScaledTemp'] = preprocessing.scale(hvac['ActualTemp'])

In [None]:
hvac['ScaledTemp'].head()

0   -1.293272
1    0.048732
2    0.719733
3   -0.622270
4    0.853934
Name: ScaledTemp, dtype: float64

### Scale using a min and max scaler 

In [None]:
min_max_scaler = preprocessing.MinMaxScaler()

In [None]:
temp_minmax = min_max_scaler.fit_transform(hvac[['ActualTemp']])

In [None]:
temp_minmax

array([[0.12],
       [0.52],
       [0.72],
       ...,
       [0.56],
       [0.32],
       [0.44]])

### Exercise: add the temp_minmax back to the dataframe as a new column

In [None]:
hvac['MinMaxScaledTemp'] = temp_minmax[:,0]
hvac['MinMaxScaledTemp'].head()

0    0.12
1    0.52
2    0.72
3    0.32
4    0.76
Name: MinMaxScaledTemp, dtype: float64

## Case Study: Preparing Lobste.rs

In [None]:
import pandas as pd
import requests
from fuzzywuzzy import fuzz
from collections import Counter
from sklearn import preprocessing

### If you'd rather read from the API to get the latest, uncomment the details (and add comment to the final line)

In [None]:
stories = pd.read_json(r'C:\Users\risha\Documents\KRMU\AIML_assigment\datasets\all_lobsters.json')

In [None]:
stories.head()

Unnamed: 0,comment_count,comments_url,created_at,description,downvotes,last_updated,score,short_id_url,submitter_user,tags,title,upvotes,url
09zw7r,0,https://lobste.rs/s/09zw7r/edited_truth,2017-08-08 20:11:09,,0,2017-08-09T11:03:57.014269,3,https://lobste.rs/s/09zw7r,{'avatar_url': 'https://lobste.rs/avatars/trn-...,"[crypto, pdf]",The Edited Truth,3,https://eprint.iacr.org/2017/714.pdf
0bdne7,17,https://lobste.rs/s/0bdne7/rise_social_media_v...,2017-08-08 21:12:38,,9,2017-08-09T11:03:57.014269,-1,https://lobste.rs/s/0bdne7,{'avatar_url': 'https://lobste.rs/avatars/nkhu...,"[law, privacy]",The Rise of The Social Media Vigilante,8,https://medium.com/@nkhumphreys_89452/the-rise...
1bhbod,11,https://lobste.rs/s/1bhbod/tcl_misunderstood_a...,2017-04-30 20:28:52,<p>Did any language end up taking that “highly...,0,2017-05-01T06:29:11.725518,17,https://lobste.rs/s/1bhbod,"{'is_moderator': False, 'is_admin': False, 'us...",[programming],Tcl the misunderstood - antirez,17,http://antirez.com/articoli/tclmisunderstood.html
1xkje1,0,https://lobste.rs/s/1xkje1/interview_4_jonatha...,2017-05-01 02:31:35,<p>Rust’s own Jonathan Turner on his backgroun...,0,2017-05-01T06:29:11.725518,1,https://lobste.rs/s/1xkje1,"{'is_moderator': False, 'is_admin': False, 'us...","[audio, javascript, rust]",🎤🎙 Interview 4 – Jonathan Turner: Part 1/3,1,http://www.newrustacean.com/show_notes/intervi...
2dasvh,19,https://lobste.rs/s/2dasvh/return_hipster_pda,2017-08-08 14:25:29,,0,2017-08-09T11:03:56.287654,20,https://lobste.rs/s/2dasvh,{'created_at': '2017-01-19T14:56:50.000-06:00'...,[practices],The Return of the Hipster PDA,20,http://www.agilesysadmin.net/return-of-the-hip...


In [None]:
stories.dtypes

comment_count              int64
comments_url              object
created_at        datetime64[ns]
description               object
downvotes                  int64
last_updated              object
score                      int64
short_id_url              object
submitter_user            object
tags                      object
title                     object
upvotes                    int64
url                       object
dtype: object

### Let's take a look at the submitter_user field, as it appears like a dict

In [None]:
stories.submitter_user.iloc[3]

{'is_moderator': False,
 'is_admin': False,
 'username': 'chriskrycho',
 'karma': 27,
 'avatar_url': 'https://secure.gravatar.com/avatar/c096ed07142659408dc6651f8320acd3?r=pg&d=identicon&s=100',
 'created_at': '2016-08-15T09:33:28.000-05:00',
 'about': "I'm a husband and father; a theologian, composer, poet, and essayist; a front end developer at [Olo](http://www.olo.com); a [Rust](https://www.rust-lang.org/en-US/) enthusiast host; and the host of the [Winning Slowly](http://www.winningslowly.org), [New Rustacean](http://www.newrustacean.com/), [Sap.py](http://www.sap-py.com), and [Run With Me](http://runwith.chriskrycho.com/) podcasts."}

In [None]:
user_df = stories['submitter_user'].apply(pd.Series)

In [None]:
user_df.head()

Unnamed: 0,avatar_url,created_at,is_admin,username,karma,is_moderator,about,github_username
09zw7r,https://lobste.rs/avatars/trn-100.png,2017-01-19T14:56:50.000-06:00,False,trn,429,False,,
0bdne7,https://lobste.rs/avatars/nkhumphreys-100.png,2014-07-02T06:36:39.000-05:00,False,nkhumphreys,-1,False,Web developer and previously embedded C developer,
1bhbod,https://secure.gravatar.com/avatar/85002353297...,2016-11-30T10:14:24.000-06:00,False,yumaikas,578,False,I blog infrequently at https://junglecoder.com...,
1xkje1,https://secure.gravatar.com/avatar/c096ed07142...,2016-08-15T09:33:28.000-05:00,False,chriskrycho,27,False,"I'm a husband and father; a theologian, compos...",
2dasvh,https://lobste.rs/avatars/trn-100.png,2017-01-19T14:56:50.000-06:00,False,trn,429,False,,


### Can we combine the user data without potential column overlap?

In [None]:
set(user_df.columns).intersection(stories.columns)

{'created_at'}

In [None]:
user_df = user_df.rename(columns={'created_at': 
                                  'user_created_at'})

In [None]:
stories = pd.concat([stories.drop(['submitter_user'], axis=1), 
                     user_df], axis=1)

In [None]:
stories.head()

Unnamed: 0,comment_count,comments_url,created_at,description,downvotes,last_updated,score,short_id_url,tags,title,upvotes,url,avatar_url,user_created_at,is_admin,username,karma,is_moderator,about,github_username
09zw7r,0,https://lobste.rs/s/09zw7r/edited_truth,2017-08-08 20:11:09,,0,2017-08-09T11:03:57.014269,3,https://lobste.rs/s/09zw7r,"[crypto, pdf]",The Edited Truth,3,https://eprint.iacr.org/2017/714.pdf,https://lobste.rs/avatars/trn-100.png,2017-01-19T14:56:50.000-06:00,False,trn,429,False,,
0bdne7,17,https://lobste.rs/s/0bdne7/rise_social_media_v...,2017-08-08 21:12:38,,9,2017-08-09T11:03:57.014269,-1,https://lobste.rs/s/0bdne7,"[law, privacy]",The Rise of The Social Media Vigilante,8,https://medium.com/@nkhumphreys_89452/the-rise...,https://lobste.rs/avatars/nkhumphreys-100.png,2014-07-02T06:36:39.000-05:00,False,nkhumphreys,-1,False,Web developer and previously embedded C developer,
1bhbod,11,https://lobste.rs/s/1bhbod/tcl_misunderstood_a...,2017-04-30 20:28:52,<p>Did any language end up taking that “highly...,0,2017-05-01T06:29:11.725518,17,https://lobste.rs/s/1bhbod,[programming],Tcl the misunderstood - antirez,17,http://antirez.com/articoli/tclmisunderstood.html,https://secure.gravatar.com/avatar/85002353297...,2016-11-30T10:14:24.000-06:00,False,yumaikas,578,False,I blog infrequently at https://junglecoder.com...,
1xkje1,0,https://lobste.rs/s/1xkje1/interview_4_jonatha...,2017-05-01 02:31:35,<p>Rust’s own Jonathan Turner on his backgroun...,0,2017-05-01T06:29:11.725518,1,https://lobste.rs/s/1xkje1,"[audio, javascript, rust]",🎤🎙 Interview 4 – Jonathan Turner: Part 1/3,1,http://www.newrustacean.com/show_notes/intervi...,https://secure.gravatar.com/avatar/c096ed07142...,2016-08-15T09:33:28.000-05:00,False,chriskrycho,27,False,"I'm a husband and father; a theologian, compos...",
2dasvh,19,https://lobste.rs/s/2dasvh/return_hipster_pda,2017-08-08 14:25:29,,0,2017-08-09T11:03:56.287654,20,https://lobste.rs/s/2dasvh,[practices],The Return of the Hipster PDA,20,http://www.agilesysadmin.net/return-of-the-hip...,https://lobste.rs/avatars/trn-100.png,2017-01-19T14:56:50.000-06:00,False,trn,429,False,,


### Let's check for nulls

In [None]:
stories.shape

(74, 20)

In [None]:
stories.dropna().shape

(8, 20)

In [None]:
stories.dropna(thresh=10, axis=1).shape

(74, 19)

### Exercise: which columns would be dropped?

In [None]:
set(stories.columns) - set(stories.dropna(thresh=10, axis=1).columns)


{'github_username'}

## Let's make the tags easier to use by having them as features in the columns.

In [None]:
tag_df = stories.tags.apply(pd.Series)

In [None]:
tag_df.head()

Unnamed: 0,0,1,2,3,4
09zw7r,crypto,pdf,,,
0bdne7,law,privacy,,,
1bhbod,programming,,,,
1xkje1,audio,javascript,rust,,
2dasvh,practices,,,,


In [None]:
pd.unique(tag_df.values.ravel())

array(['crypto', 'pdf', nan, 'law', 'privacy', 'programming', 'audio',
       'javascript', 'rust', 'practices', 'ruby', 'devops', 'web',
       'hardware', 'science', 'reversing', 'security', 'openbsd',
       'windows', 'design', 'compilers', 'haskell', 'c++', 'assembly',
       'games', 'math', 'release', 'event', 'netbsd', 'unix', 'c',
       'linux', 'testing', 'lua', 'job', 'video', 'philosophy', 'android',
       'networking', 'erlang', 'emacs', 'historical', 'browsers',
       'person', 'culture', 'java', 'go', 'book', 'css', 'debugging',
       'education', 'art', 'compsci', 'databases'], dtype=object)

In [None]:
set(tag_df.values.ravel())

{'android',
 'art',
 'assembly',
 'audio',
 'book',
 'browsers',
 'c',
 'c++',
 'compilers',
 'compsci',
 'crypto',
 'css',
 'culture',
 'databases',
 'debugging',
 'design',
 'devops',
 'education',
 'emacs',
 'erlang',
 'event',
 'games',
 'go',
 'hardware',
 'haskell',
 'historical',
 'java',
 'javascript',
 'job',
 'law',
 'linux',
 'lua',
 'math',
 nan,
 'netbsd',
 'networking',
 'openbsd',
 'pdf',
 'person',
 'philosophy',
 'practices',
 'privacy',
 'programming',
 'release',
 'reversing',
 'ruby',
 'rust',
 'science',
 'security',
 'testing',
 'unix',
 'video',
 'web',
 'windows'}

In [None]:
len(pd.unique(tag_df.values.ravel()))

54

In [None]:
# most common tags

Counter(tag_df.values.ravel()).most_common(5)

[(nan, 231),
 ('programming', 13),
 ('hardware', 10),
 ('security', 10),
 ('practices', 8)]

### Let's create a dummy df with our tags

In [None]:
tag_df = pd.get_dummies(tag_df.apply(pd.Series).stack()).sum()

In [None]:
tag_df.head()

android     1
art         1
assembly    3
audio       1
book        2
dtype: int64

### Now we can add it back to our stories DataFrame

In [None]:
stories = pd.concat([stories.drop('tags', axis=1), 
                     tag_df], axis=1)

In [None]:
stories.head()

Unnamed: 0,comment_count,comments_url,created_at,description,downvotes,last_updated,score,short_id_url,title,upvotes,url,avatar_url,user_created_at,is_admin,username,karma,is_moderator,about,github_username,0
09zw7r,0.0,https://lobste.rs/s/09zw7r/edited_truth,2017-08-08 20:11:09,,0.0,2017-08-09T11:03:57.014269,3.0,https://lobste.rs/s/09zw7r,The Edited Truth,3.0,https://eprint.iacr.org/2017/714.pdf,https://lobste.rs/avatars/trn-100.png,2017-01-19T14:56:50.000-06:00,False,trn,429.0,False,,,
0bdne7,17.0,https://lobste.rs/s/0bdne7/rise_social_media_v...,2017-08-08 21:12:38,,9.0,2017-08-09T11:03:57.014269,-1.0,https://lobste.rs/s/0bdne7,The Rise of The Social Media Vigilante,8.0,https://medium.com/@nkhumphreys_89452/the-rise...,https://lobste.rs/avatars/nkhumphreys-100.png,2014-07-02T06:36:39.000-05:00,False,nkhumphreys,-1.0,False,Web developer and previously embedded C developer,,
1bhbod,11.0,https://lobste.rs/s/1bhbod/tcl_misunderstood_a...,2017-04-30 20:28:52,<p>Did any language end up taking that “highly...,0.0,2017-05-01T06:29:11.725518,17.0,https://lobste.rs/s/1bhbod,Tcl the misunderstood - antirez,17.0,http://antirez.com/articoli/tclmisunderstood.html,https://secure.gravatar.com/avatar/85002353297...,2016-11-30T10:14:24.000-06:00,False,yumaikas,578.0,False,I blog infrequently at https://junglecoder.com...,,
1xkje1,0.0,https://lobste.rs/s/1xkje1/interview_4_jonatha...,2017-05-01 02:31:35,<p>Rust’s own Jonathan Turner on his backgroun...,0.0,2017-05-01T06:29:11.725518,1.0,https://lobste.rs/s/1xkje1,🎤🎙 Interview 4 – Jonathan Turner: Part 1/3,1.0,http://www.newrustacean.com/show_notes/intervi...,https://secure.gravatar.com/avatar/c096ed07142...,2016-08-15T09:33:28.000-05:00,False,chriskrycho,27.0,False,"I'm a husband and father; a theologian, compos...",,
2dasvh,19.0,https://lobste.rs/s/2dasvh/return_hipster_pda,2017-08-08 14:25:29,,0.0,2017-08-09T11:03:56.287654,20.0,https://lobste.rs/s/2dasvh,The Return of the Hipster PDA,20.0,http://www.agilesysadmin.net/return-of-the-hip...,https://lobste.rs/avatars/trn-100.png,2017-01-19T14:56:50.000-06:00,False,trn,429.0,False,,,


### Another potentially useful feature is the post times...

In [None]:
stories['created_hour'] = stories.created_at.map(
    lambda x: x.hour)

In [None]:
stories['created_dow'] = stories.created_at.map(
    lambda x: x.weekday())

### Let's analyze some of the correlations in our features so far...

In [None]:
stories[['created_hour', 'score']].corr()

Unnamed: 0,created_hour,score
created_hour,1.0,0.253917
score,0.253917,1.0


In [None]:
stories[['created_dow', 'score']].corr()

Unnamed: 0,created_dow,score
created_dow,1.0,-0.113918
score,-0.113918,1.0


In [None]:
stories[['karma', 'score']].corr()

Unnamed: 0,karma,score
karma,1.0,-0.061921
score,-0.061921,1.0


In [None]:
stories[['comment_count', 'score']].corr()

Unnamed: 0,comment_count,score
comment_count,1.0,0.637632
score,0.637632,1.0


In [None]:
stories[[ 'score']].corr()

Unnamed: 0,score
score,1.0


### We might also want/need to normalize scores. We can use a Scaler / MinMaxScaler or Normalizer

In [None]:
stories['score']=stories['score'].fillna(stories.score.mean())

In [None]:
normed_score = preprocessing.normalize(stories[['score']])

In [None]:
normed_score[:5]

array([[ 1.],
       [-1.],
       [ 1.],
       [ 1.],
       [ 1.]])

#### hmm... maybe a min-max scaler works better for our needs!

In [None]:
scaler = preprocessing.MinMaxScaler()

In [None]:
scaled_score = scaler.fit_transform(stories[['score']])

In [None]:
scaled_score[:5]

array([[0.07272727],
       [0.        ],
       [0.32727273],
       [0.03636364],
       [0.38181818]])

In [None]:
stories['scaled_score'] = scaled_score[:,0]
stories['scaled_score']

09zw7r     0.072727
0bdne7     0.000000
1bhbod     0.327273
1xkje1     0.036364
2dasvh     0.381818
             ...   
testing    0.155037
unix       0.155037
video      0.155037
web        0.155037
windows    0.155037
Name: scaled_score, Length: 127, dtype: float64