In [150]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [151]:
list1 = []
with open("trainingdata.txt") as file:
    for line in file:
        inner_list = [line.strip() for line in line.split('\n')]
        #print(inner_list)
        list1.append(inner_list)

In [152]:
number_of_docs = (list1[0])
print(number_of_docs)

['5485', '']


In [153]:
df = pd.DataFrame(list1)
print (df.head())

                                                   0 1
0                                               5485  
1  1 champion products ch approves stock split ch...  
2  2 computer terminal systems cpml completes sal...  
3  1 cobanco inc cbco year net shr cts vs dlrs ne...  
4  1 am international inc am nd qtr jan oper shr ...  


In [154]:
df = df.drop([0])

In [155]:
new_df = df[0].str.split(" ", n = 1, expand = True)

In [156]:
new_df.columns = ['label','text']
new_df

Unnamed: 0,label,text
1,1,champion products ch approves stock split cham...
2,2,computer terminal systems cpml completes sale ...
3,1,cobanco inc cbco year net shr cts vs dlrs net ...
4,1,am international inc am nd qtr jan oper shr lo...
5,1,brown forman inc bfd th qtr net shr one dlr vs...
6,1,dean foods df sees strong th qtr earnings dean...
7,1,brown forman bfdb sets stock split ups payout ...
8,1,esquire radio and electronics inc ee th qtr sh...
9,1,united presidential corp upco th qtr net shr c...
10,1,owens and minor inc obod raises qtly dividend ...


In [157]:
new_df['label'] = new_df['label'].astype('category')

In [158]:
new_df['text'] = new_df['text'].astype('str')

In [159]:
new_df['text'] = new_df['text'].str.strip()
new_df

Unnamed: 0,label,text
1,1,champion products ch approves stock split cham...
2,2,computer terminal systems cpml completes sale ...
3,1,cobanco inc cbco year net shr cts vs dlrs net ...
4,1,am international inc am nd qtr jan oper shr lo...
5,1,brown forman inc bfd th qtr net shr one dlr vs...
6,1,dean foods df sees strong th qtr earnings dean...
7,1,brown forman bfdb sets stock split ups payout ...
8,1,esquire radio and electronics inc ee th qtr sh...
9,1,united presidential corp upco th qtr net shr c...
10,1,owens and minor inc obod raises qtly dividend ...


In [160]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5485 entries, 1 to 5485
Data columns (total 2 columns):
label    5485 non-null category
text     5485 non-null object
dtypes: category(1), object(1)
memory usage: 91.4+ KB


### First Model try with no feature engineering

In [161]:
X = new_df['text']
y = new_df['label']

### Random Forest Classifier

In [162]:
rf = RandomForestClassifier(n_estimators=100)  # instantiate the estimator

In [163]:
X = CountVectorizer().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)

In [164]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [165]:
y_pred = rf.predict(X_test) #predicting to test on the 20% test set
print(y_pred)

['2' '1' '1' ... '2' '1' '1']


In [166]:
accuracy = accuracy_score(y_pred,y_test)
print(accuracy)

0.8484503190519599


In [167]:
class_res = classification_report(y_pred,y_test)
print(class_res)

             precision    recall  f1-score   support

          1       0.95      0.95      0.95      2251
          2       0.99      0.70      0.82      1793
          3       0.56      0.93      0.70       122
          4       0.01      1.00      0.02         1
          5       0.00      0.00      0.00         0
          6       0.34      1.00      0.51        71
          7       0.64      0.92      0.75       106
          8       0.24      0.98      0.39        44

avg / total       0.93      0.85      0.87      4388



  'recall', 'true', average, warn_for)


In [168]:
cm = confusion_matrix(y_pred,y_test)
print(cm)

[[2148    8   18    8   10   32   16   11]
 [ 119 1249   72   82   21  105   37  108]
 [   0    0  114    0    1    0    0    7]
 [   0    0    0    1    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0   71    0    0]
 [   0    0    1    0    0    0   97    8]
 [   0    0    0    0    0    0    1   43]]


In [169]:
# most common word
pd.Series(' '.join(new_df['text']).split()).value_counts()[:100]

the              27302
of               16258
to               15256
in               11462
said             11069
a                11068
and              10880
mln               9960
vs                9162
dlrs              6626
s                 6292
for               5920
it                5801
cts               5572
reuter            5012
net               4376
its               4146
loss              3916
on                3836
year              3621
pct               3360
from              3345
is                3187
that              3148
company           2999
by                2809
will              2735
inc               2713
with              2698
be                2621
                 ...  
note              1034
quarter           1020
per               1015
offer             1007
first              968
market             946
oper               932
they               886
up                 883
record             869
dividend           858
ltd                857
been       

### KNN

In [170]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [171]:
y_pred_knn = knn.predict(X_test)

In [172]:
print(accuracy_score(y_pred_knn,y_test))

0.8181403828623519


In [173]:
print(classification_report(y_pred_knn,y_test))

             precision    recall  f1-score   support

          1       0.94      0.87      0.91      2456
          2       0.88      0.77      0.82      1431
          3       0.55      0.71      0.62       157
          4       0.08      0.35      0.13        20
          5       0.06      0.67      0.11         3
          6       0.29      0.94      0.44        64
          7       0.71      0.59      0.64       182
          8       0.34      0.80      0.48        75

avg / total       0.87      0.82      0.84      4388



In [174]:
print(confusion_matrix(y_pred_knn,y_test))


[[2141  123   37   39   14   64   10   28]
 [ 115 1101   49   35   10   58   20   43]
 [   1    6  112    1    1   12    3   21]
 [   0    2    1    7    0    8    2    0]
 [   0    0    0    0    2    1    0    0]
 [   0    2    1    0    1   60    0    0]
 [   8   23    3    8    3    5  107   25]
 [   2    0    2    1    1    0    9   60]]
