In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, make_scorer, f1_score


In [2]:
# Build a labelled DataFrame from the files found under a directory.
def build_data_frame(path):
    """Collect every record yielded by ``read_files(path)`` into a DataFrame.

    Parameters
    ----------
    path : str
        Directory passed through unchanged to ``read_files``.

    Returns
    -------
    pandas.DataFrame
        One row per yielded file, indexed by file name, with a ``text``
        column (file content) and a ``class`` column (label). Both columns
        are present even when no file was yielded.
    """
    rows = []
    index = []
    for file_name, text, label in read_files(path):
        rows.append({'text': text, 'class': label})
        index.append(file_name)

    # Naming the columns explicitly keeps the schema stable for an empty
    # input, where the (empty) list of row dicts carries no keys at all.
    return pd.DataFrame(rows, index=index, columns=['text', 'class'])

In [3]:
def read_files(path):
    """Walk ``path`` recursively and yield one record per regular file.

    The label is derived from the file name alone: ``"D"`` if the name
    contains an upper-case ``D``, otherwise ``"R"`` if it contains an
    upper-case ``R``, otherwise ``"X"``.

    Parameters
    ----------
    path : str
        Root directory handed to ``os.walk``.

    Yields
    ------
    tuple of (str, str, str)
        ``(file_name, content, label)``. ``content`` is the file's lines with
        their trailing newline stripped, concatenated without a separator.
    """
    newline = ''
    for root, dir_names, file_names in os.walk(path):
        print('Root folder: {0}'.format(root))
        print('Number of files read: {0}'.format(len(file_names)))
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            if not os.path.isfile(file_path):
                continue

            # NOTE(review): this is a substring test, so a "D" anywhere in the
            # name takes precedence over "R" -- confirm against the data set's
            # file-naming scheme.
            if "D" in file_name:
                label = "D"
            elif "R" in file_name:
                label = "R"
            else:
                label = "X"

            # The context manager closes the handle even if reading or
            # decoding a line raises part-way through the file.
            with open(file_path) as f:
                lines = [line.rstrip("\n") for line in f]

            # NOTE(review): the separator is the empty string, so the end of
            # one line and the start of the next run together with no
            # whitespace between them -- confirm this is intended.
            yield file_name, newline.join(lines), label

In [4]:
# Locate the data set relative to the current working directory.
path = os.path.join(os.getcwd(), 'data_set')
# Build the data set directly from the files. DataFrame.append was removed in
# pandas 2.0, so the frame is no longer grown from an empty placeholder;
# reindex keeps the 'text' and 'class' columns present even if nothing was read.
data = build_data_frame(path).reindex(columns=['text', 'class'])

Root folder: /home/pk-user/Documents/IE594-Data-Science/Speech-classification/data_set
Number of files read: 856


In [5]:
# Create N-gram features.
# Stratified 90/10 split on the class label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    data['text'],
    data['class'],
    test_size=0.1,
    random_state=2,
    stratify=data['class'],
)

# Count n-grams of length 1 through 4. The vocabulary is fitted on the
# training texts only and then applied as-is to the test texts.
count_vectorizer = CountVectorizer(ngram_range=(1, 4))
x_train = count_vectorizer.fit_transform(x_train)
x_test = count_vectorizer.transform(x_test)

In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


In [7]:
# AdaBoost over depth-1 trees (stumps): learning_rate=0.1, 200 estimators.
# random_state is fixed because tree fitting permutes the features at every
# split, so tied splits could otherwise be resolved differently on each run;
# AdaBoost passes the seed on to each base tree.
# NOTE(review): this and the following cells compare hyper-parameters on the
# held-out split itself; consider selecting them by cross-validation on the
# training data instead (GridSearchCV is already imported).
bdt = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    learning_rate=0.1,
    n_estimators=200,
    random_state=2,
)
bdt.fit(x_train, y_train)
y_pred = bdt.predict(x_test)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          D       0.70      0.54      0.61        39
          R       0.68      0.81      0.74        47

avg / total       0.69      0.69      0.68        86



In [8]:
# AdaBoost over depth-1 trees (stumps): learning_rate=0.01, 200 estimators.
# random_state is fixed because tree fitting permutes the features at every
# split, so tied splits could otherwise be resolved differently on each run;
# AdaBoost passes the seed on to each base tree.
bdt = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    learning_rate=0.01,
    n_estimators=200,
    random_state=2,
)
bdt.fit(x_train, y_train)
y_pred = bdt.predict(x_test)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          D       0.65      0.33      0.44        39
          R       0.61      0.85      0.71        47

avg / total       0.63      0.62      0.59        86



In [9]:
# AdaBoost over depth-1 trees (stumps): learning_rate=0.001, 200 estimators.
# random_state is fixed because tree fitting permutes the features at every
# split, so tied splits could otherwise be resolved differently on each run;
# AdaBoost passes the seed on to each base tree.
bdt = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    learning_rate=0.001,
    n_estimators=200,
    random_state=2,
)
bdt.fit(x_train, y_train)
y_pred = bdt.predict(x_test)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          D       0.56      0.26      0.35        39
          R       0.57      0.83      0.68        47

avg / total       0.57      0.57      0.53        86



In [10]:
# AdaBoost over depth-2 trees: learning_rate=0.1, 200 estimators.
# random_state is fixed because tree fitting permutes the features at every
# split, so tied splits could otherwise be resolved differently on each run;
# AdaBoost passes the seed on to each base tree.
bdt = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    learning_rate=0.1,
    n_estimators=200,
    random_state=2,
)
bdt.fit(x_train, y_train)
y_pred = bdt.predict(x_test)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          D       0.66      0.69      0.68        39
          R       0.73      0.70      0.72        47

avg / total       0.70      0.70      0.70        86



In [11]:
# AdaBoost over depth-2 trees: learning_rate=0.01, 200 estimators.
# random_state is fixed because tree fitting permutes the features at every
# split, so tied splits could otherwise be resolved differently on each run;
# AdaBoost passes the seed on to each base tree.
bdt = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    learning_rate=0.01,
    n_estimators=200,
    random_state=2,
)
bdt.fit(x_train, y_train)
y_pred = bdt.predict(x_test)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          D       0.57      0.41      0.48        39
          R       0.60      0.74      0.67        47

avg / total       0.59      0.59      0.58        86



In [12]:
# AdaBoost over depth-2 trees: learning_rate=0.001, 200 estimators.
# random_state is fixed because tree fitting permutes the features at every
# split, so tied splits could otherwise be resolved differently on each run;
# AdaBoost passes the seed on to each base tree.
bdt = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    learning_rate=0.001,
    n_estimators=200,
    random_state=2,
)
bdt.fit(x_train, y_train)
y_pred = bdt.predict(x_test)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          D       0.57      0.31      0.40        39
          R       0.58      0.81      0.68        47

avg / total       0.58      0.58      0.55        86

