In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from nltk.stem import SnowballStemmer

In [2]:
# function for reading the data files
def build_data_frame(path):
    rows = []
    index = []
    classification =[]
    for file_name, text, classification in read_files(path):
        rows.append({'text':text, 'class': classification})
        index.append(file_name)
        
    data_frame = pd.DataFrame(rows,index=index)
    return data_frame

In [3]:
def read_files(path):
    newline=''
    for root, dir_names, file_names in os.walk(path):
        print('Root folder: {0}'.format(root))
        print('Number of files read: {0}'.format(len(file_names)))
        for file_name in file_names:
            file_path = os.path.join(root,file_name)
            if(os.path.isfile(file_path)):
                #print(file_name)
                if("D" in file_name):
                    label="D"
                elif("R" in file_name):
                    label="R"
                else:
                    label="X"
                lines = []
                f = open(file_path)
                for line in f:
                    lines.append(line.rstrip("\n"))
                f.close()
                content=newline.join(lines)
                yield file_name, content, label

In [4]:
# here I set the path of data set using os.getcwd()
path = os.path.join(os.getcwd(), 'data_set')
# intitalize the empty data frame
data = pd.DataFrame({'text':[],'class':[]})
# call the function to build the data set
data=data.append(build_data_frame(path))

Root folder: C:\Users\abhis\AnacondaProjects\Speech-classification\data_set
Number of files read: 856


### Stemming using SnowballStemmer

In [50]:
rows =[]
index=[]
for i in range(len(data)):
    stemtext =SnowballStemmer('english').stem(data['text'][i])
    rows.append({'stemedtext':stemtext,'class':data['class'][i]})
    index.append(data.index[i])

datastemmed=pd.DataFrame(rows, index=index)


### checking the head and tail to ensure there is no problem with stemming of data

In [51]:
datastemmed.head()

Unnamed: 0,class,stemedtext
048_400027_0297016_RMY.txt,R,"mr. chairman , i rise today in support of h.r...."
048_400029_0294001_ROY.txt,R,"mr. speaker , by direction of the committee on..."
048_400029_0294013_ROY.txt,R,"mr. speaker , i reserve the balance of my time..."
048_400029_0294015_ROY.txt,R,"mr. speaker , i yield myself such time as i ma..."
048_400029_0294019_ROY.txt,R,"mr. speaker , i yield back the balance of my t..."


In [49]:
data.head()

Unnamed: 0,class,text
048_400027_0297016_RMY.txt,R,"mr. chairman , i rise today in support of h.r...."
048_400029_0294001_ROY.txt,R,"mr. speaker , by direction of the committee on..."
048_400029_0294013_ROY.txt,R,"mr. speaker , i reserve the balance of my time..."
048_400029_0294015_ROY.txt,R,"mr. speaker , i yield myself such time as i ma..."
048_400029_0294019_ROY.txt,R,"mr. speaker , i yield back the balance of my t..."


In [52]:
datastemmed.tail()

Unnamed: 0,class,stemedtext
599_400328_2990021_ROY.txt,R,"mr. speaker , i yield myself such time as i ma..."
599_400328_2990023_ROY.txt,R,"mr. speaker , i yield myself such time as i ma..."
599_400328_2990027_ROY.txt,R,"mr. speaker , reclaiming my time , the gentlem..."
599_400328_2990029_ROY.txt,R,"mr. speaker , i yield back the balance of my t..."
599_400328_2990030_ROY.txt,R,"mr. speaker , on that i demand the yeas and na..."


In [48]:
data.tail()

Unnamed: 0,class,text
599_400328_2990021_ROY.txt,R,"mr. speaker , i yield myself such time as i ma..."
599_400328_2990023_ROY.txt,R,"mr. speaker , i yield myself such time as i ma..."
599_400328_2990027_ROY.txt,R,"mr. speaker , reclaiming my time , the gentlem..."
599_400328_2990029_ROY.txt,R,"mr. speaker , i yield back the balance of my t..."
599_400328_2990030_ROY.txt,R,"mr. speaker , on that i demand the yeas and na..."


### Let's beging the classification part on stemmed data

In [92]:
#Split data into train and test
train_data, test_data = train_test_split(datastemmed, test_size = 0.2, stratify = datastemmed['class'])

In [93]:
# using Pipeline 
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,3))),
    ('classifier', MultinomialNB())
])
pipeline.fit(train_data['stemedtext'],train_data['class'])
predicitions = pipeline.predict(test_data['stemedtext'])
print(classification_report(test_data['class'],predicitions))


             precision    recall  f1-score   support

          D       0.87      0.53      0.66        78
          R       0.70      0.94      0.80        94

avg / total       0.78      0.75      0.74       172



### classification on non stemmed data

In [94]:
#Split data into train and test
train_data, test_data = train_test_split(data, test_size = 0.2, stratify = data['class'])

In [95]:
# using Pipeline 
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,3))),
    ('classifier', MultinomialNB())
])
pipeline.fit(train_data['text'],train_data['class'])
predicitions = pipeline.predict(test_data['text'])
print(classification_report(test_data['class'],predicitions))


             precision    recall  f1-score   support

          D       0.78      0.55      0.65        78
          R       0.70      0.87      0.78        94

avg / total       0.74      0.73      0.72       172



In [76]:
# when i compared samples of stemmed text with original text I found both of them to be similar that's why 
# I think there is no improvement in the score