In [1]:
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
# Root of the local checkout that contains the 'corpus/processed' tree.
# Set the NICHE_PARENT environment variable to point somewhere else; the
# original hard-coded location is kept as the fallback so existing setups
# keep working unchanged.
PARENT = os.environ.get('NICHE_PARENT', '/Users/shreydesai/GitHub/niche')
# Topic labels. Each name is also a sub-directory of 'corpus/processed', and a
# category's position in this list is used as its numeric class id.
CATEGORIES = ['entertainment', 'sports', 'fun', 'games',
              'weather', 'science', 'technology', 'politics']

## Part 1: Reading text dataset into pandas
- Each category has a list of fileids
- Each fileid represents a Twitter account with ~3000 tweets
- Each tweet is stored as a `(category, sent, index)` tuple, where `index` is the category's position in `CATEGORIES`

In [3]:
def fileids(category):
    """Gets a list of file IDs for each category

    Lists the entries of ``PARENT/corpus/processed/<category>``, drops the
    '.DS_Store' entry, and returns the remaining names.

    Args:
        category: name of the category sub-directory to list.

    Returns:
        list of str: the directory's entry names in sorted order.

    Raises:
        OSError: if the category directory cannot be listed.
    """
    path = os.path.join(PARENT, 'corpus', 'processed', category)
    blacklist = {'.DS_Store'}
    # Sort the names: the iteration order of a set of strings can differ from
    # one interpreter run to the next, which would reorder the loaded rows and
    # make a seeded train/test split non-reproducible.
    return sorted(name for name in os.listdir(path) if name not in blacklist)

def read_sents(file):
    """Reads sentences from specified file

    Args:
        file: path of a text file with one sentence per line.

    Returns:
        list of str: the file's lines, each stripped of surrounding
        whitespace. Leading and trailing blank lines of the file are dropped
        before splitting; an empty file yields [''].
    """
    # The context manager closes the handle even if reading raises.
    with open(file, 'r', encoding='ISO-8859-1') as f:
        text = f.read().strip()
    return [sent.strip() for sent in text.split('\n')]

# Flat list of (category, sentence, class id) records, one per sentence,
# gathered from every file of every category. The class id is the category's
# position in CATEGORIES.
data = []

for index, category in enumerate(CATEGORIES):
    category_dir = os.path.join(PARENT, 'corpus', 'processed', category)
    for fileid in fileids(category):
        for sent in read_sents(os.path.join(category_dir, fileid)):
            data.append((category, sent, index))

In [4]:
# Build the frame in a single step. Passing `columns` names the three tuple
# fields directly and also works when `data` is empty, whereas assigning
# `df.columns` afterwards fails on a frame that has no columns.
# NOTE: the 'index' column shares its name with the DataFrame.index attribute,
# so it has to be selected as df['index']; df.index is the row index.
df = pd.DataFrame.from_records(data, columns=['category', 'tweet', 'index'])
df.head()

Unnamed: 0,category,tweet,index
0,entertainment,meghan markle's acting past comes to light see...,0
1,entertainment,zendaya talks about her growing fashion empire...,0
2,entertainment,this is us milo ventimiglia mandy moore on tha...,0
3,entertainment,beauty and the beast becomes fandango's top fa...,0
4,entertainment,angelina jolie gives her first college lecture...,0


In [5]:
# number of rows (tweets) per category
df['category'].value_counts()

entertainment    46393
science          46082
politics         43354
technology       42912
sports           40199
weather          39101
fun              38829
games            31301
Name: category, dtype: int64

In [6]:
# define X and y for scikit-learn
X = df['tweet']
# Select the label column with brackets: attribute access (df.index) returns
# the DataFrame's row index (0..N-1), not the 'index' column of class ids.
y = df['index']
print(X.shape, y.shape)

(328171,) (328171,)


In [7]:
# Partition features and labels into train and test parts; the fixed
# random_state makes the shuffle repeatable for the same input order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(246128,) (82043,) (246128,) (82043,)


## Part 2: Vectorizing the dataset

In [None]:
# Bag-of-words token counts. Per scikit-learn's CountVectorizer docs,
# max_df=0.5 ignores terms that occur in more than half of the documents.
vect = CountVectorizer(max_df=0.5)
# learn the vocabulary from the training data and transform it
X_train_dm = vect.fit_transform(X_train)
# only transform the test data, reusing the vocabulary fitted on the
# training data (the vectorizer is not re-fitted here)
X_test_dm = vect.transform(X_test)

## Part 3: Building/evaluating the model

In [None]:
# Create the multinomial Naive Bayes classifier and train it on the
# document-term matrix of the training set (fit returns the estimator itself).
nb = MultinomialNB().fit(X_train_dm, y_train)
# predict a class label for every row of the test document-term matrix
predictions = nb.predict(X_test_dm)

In [None]:
# fraction of test rows whose predicted label equals the true label
metrics.accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# counts per (true label, predicted label) pair: rows are true labels,
# columns are predicted labels
metrics.confusion_matrix(y_true=y_test, y_pred=predictions)