# Project 3: Web APIs & NLP

## Data Cleaning

In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB


In [114]:
cfb = pd.read_csv('./data/cfb.csv')
nfl = pd.read_csv('./data/nfl.csv')

In [115]:
cfb.head()

Unnamed: 0.1,Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,...,secure_media,secure_media_embed,author_cakeday,crosspost_parent,crosspost_parent_list,distinguished,author_flair_template_id,banned_by,suggested_sort,edited
0,0,[],False,ChevyImpalaSS,,[],,text,t2_2qa2dawe,False,...,,,,,,,,,,
1,1,[],False,lebaronslebaron,arizona-sheet1-row05-col01-player-2shtl-2rw04-...,"[{'a': ':arizona:', 'e': 'emoji', 'u': 'https:...",:arizona: :player: Arizona • Verified Player,richtext,t2_dohf4,False,...,,,,,,,,,,
2,2,[],False,HandwovenBox,byu,"[{'a': ':byu:', 'e': 'emoji', 'u': 'https://em...",:byu: BYU,richtext,t2_64enz,False,...,,,,,,,,,,
3,3,[],False,derbra,florida3-sheet1-row06-col21-orange-2shtl-2rw16...,"[{'a': ':florida3:', 'e': 'emoji', 'u': 'https...",:florida3: :orange: Florida • Orange Bowl,richtext,t2_iuylg,False,...,,,,,,,,,,
4,4,[],False,johanspot,colorado-sheet1-row05-col04-chaos-2shtl-2rw02-...,"[{'a': ':colorado:', 'e': 'emoji', 'u': 'https...",:colorado: :chaos: Colorado • Team Chaos,richtext,t2_7qsl8,False,...,,,,,,,,,,


In [116]:
nfl.head()

Unnamed: 0.1,Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,crosspost_parent_list,removed_by_category,gallery_data,is_gallery,media_metadata,suggested_sort,author_cakeday,banned_by,link_flair_template_id,edited
0,0,[],False,stonerkid10,saints,"[{'e': 'text', 't': 'Saints'}]",94b30cfa-9b4a-11e2-9c94-12313d259cae,Saints,dark,richtext,...,,,,,,,,,,
1,1,[],False,Mormonster,rams,"[{'e': 'text', 't': 'Rams'}]",,Rams,dark,richtext,...,,,,,,,,,,
2,2,[],False,Happy_Huntington,jets,"[{'e': 'text', 't': 'Jets'}]",971c30fc-9b4a-11e2-b67a-12313d1841d1,Jets,dark,richtext,...,,,,,,,,,,
3,3,[],False,SaladinsSaladbar,fortyniners,"[{'e': 'text', 't': '49ers'}]",9dd9bfb8-9b4a-11e2-bcb0-12313d169640,49ers,dark,richtext,...,,,,,,,,,,
4,4,[],False,MrAmericanIdiot,raiders,"[{'e': 'text', 't': 'Raiders'}]",9a089c06-9b4a-11e2-a4c2-12313b06caaf,Raiders,dark,richtext,...,,,,,,,,,,


In [117]:
cfb['title'].isnull().sum()

0

In [118]:
nfl['title'].isnull().sum()

0

In [119]:
print(cfb.shape)
print(nfl.shape)

(2000, 81)
(2000, 83)


In [120]:
cfb[cfb['title'] == '']

Unnamed: 0.1,Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,...,secure_media,secure_media_embed,author_cakeday,crosspost_parent,crosspost_parent_list,distinguished,author_flair_template_id,banned_by,suggested_sort,edited


In [121]:
nfl[nfl['title'] == '']

Unnamed: 0.1,Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,crosspost_parent_list,removed_by_category,gallery_data,is_gallery,media_metadata,suggested_sort,author_cakeday,banned_by,link_flair_template_id,edited


In [122]:
# Only using the title and subreddit for our models
cfb = cfb[['subreddit','title']]
nfl = nfl[['subreddit','title']]

In [123]:
cfb[:5]

Unnamed: 0,subreddit,title
0,CFB,NCAA Football 2020 #5 FLORIDA VS MISSISSIPPI
1,CFB,Arizona LB Colin Schooler Announces Transfer
2,CFB,Texas State to Play at BYU for 2020 Schedule
3,CFB,2023 4* CB Shawn Russ commits to Florida
4,CFB,"Opinion: As COVID-19 grips more campuses, coll..."


In [124]:
nfl[:5]

Unnamed: 0,subreddit,title
0,nfl,What are your way-too-early playoff predictions?
1,nfl,A detailed breakdown of the turf at SoFi Stadium
2,nfl,Dolphins express concerns about opponents usin...
3,nfl,"[Acho] Commissioner of the @NFL, Roger Goodell..."
4,nfl,[CBS Sports] Jaguars GM Dave Caldwell said the...


In [125]:
# Check for duplicates in the dataframes
cfb.drop_duplicates(subset = 'title')
print(f'There are {len(cfb)} values in cfb')

nfl.drop_duplicates(subset = 'title')
print(f'There are {len(cfb)} values in nfl')

There are 2000 values in cfb
There are 2000 values in nfl


In [126]:

data = pd.concat([cfb, nfl])

In [127]:
data

Unnamed: 0,subreddit,title
0,CFB,NCAA Football 2020 #5 FLORIDA VS MISSISSIPPI
1,CFB,Arizona LB Colin Schooler Announces Transfer
2,CFB,Texas State to Play at BYU for 2020 Schedule
3,CFB,2023 4* CB Shawn Russ commits to Florida
4,CFB,"Opinion: As COVID-19 grips more campuses, coll..."
...,...,...
1995,nfl,Crown royal develops seasonal depression cure ...
1996,nfl,[Rapoport] One name to watch as teams gear up ...
1997,nfl,[Rapoport] One name to watch as teams gear up ...
1998,nfl,[Highlight] Falcons punter Matt Bosher body sl...


In [128]:
data['subreddit'] = (data['subreddit']=='CFB').astype(int)
data.head()

Unnamed: 0,subreddit,title
0,1,NCAA Football 2020 #5 FLORIDA VS MISSISSIPPI
1,1,Arizona LB Colin Schooler Announces Transfer
2,1,Texas State to Play at BYU for 2020 Schedule
3,1,2023 4* CB Shawn Russ commits to Florida
4,1,"Opinion: As COVID-19 grips more campuses, coll..."


In [129]:
data.tail()

Unnamed: 0,subreddit,title
1995,0,Crown royal develops seasonal depression cure ...
1996,0,[Rapoport] One name to watch as teams gear up ...
1997,0,[Rapoport] One name to watch as teams gear up ...
1998,0,[Highlight] Falcons punter Matt Bosher body sl...
1999,0,How many NFC south teams will make the playoffs?


In [132]:
data.to_csv('data/data.csv', index = False)