# Project: Natural Language Processing for Stock News Analysis

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [31]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [4]:
# Read in the data
data = pd.read_csv('Full_Data.csv')
data.head(1)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title


#### Split Data

In [5]:
train = data[data['Date'] < '20150101']
test = data[data['Date'] > '20141231']

In [6]:
# Removing punctuations
slicedData= train.iloc[:,2:27]
slicedData.replace(to_replace="[^a-zA-Z]", value=" ", regex=True, inplace=True)

# Renaming column names for ease of access
list1= [i for i in range(25)]
new_Index=[str(i) for i in list1]
slicedData.columns= new_Index
slicedData.head(5)

# Convertng headlines to lower case
for index in new_Index:
    slicedData[index]=slicedData[index].str.lower()
slicedData.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,a hindrance to operations extracts from the...,scorecard,hughes instant hit buoys blues,jack gets his skates on at ice cold alex,chaos as maracana builds up for united,depleted leicester prevail as elliott spoils e...,hungry spurs sense rich pickings,gunners so wide of an easy target,derby raise a glass to strupar s debut double,southgate strikes leeds pay the penalty,...,flintoff injury piles on woe for england,hunters threaten jospin with new battle of the...,kohl s successor drawn into scandal,the difference between men and women,sara denver nurse turned solicitor,diana s landmine crusade put tories in a panic,yeltsin s resignation caught opposition flat f...,russian roulette,sold out,recovering a title


In [7]:
headlines = []
for row in range(0,len(slicedData.index)):
    headlines.append(' '.join(str(x) for x in slicedData.iloc[row,0:25]))

In [8]:
headlines[0]

'a  hindrance to operations   extracts from the leaked reports scorecard hughes  instant hit buoys blues jack gets his skates on at ice cold alex chaos as maracana builds up for united depleted leicester prevail as elliott spoils everton s party hungry spurs sense rich pickings gunners so wide of an easy target derby raise a glass to strupar s debut double southgate strikes  leeds pay the penalty hammers hand robson a youthful lesson saints party like it s      wear wolves have turned into lambs stump mike catches testy gough s taunt langer escapes to hit     flintoff injury piles on woe for england hunters threaten jospin with new battle of the somme kohl s successor drawn into scandal the difference between men and women sara denver  nurse turned solicitor diana s landmine crusade put tories in a panic yeltsin s resignation caught opposition flat footed russian roulette sold out recovering a title'

#### Prepare Data

In [23]:
basicvectorizer_list = []
basictrain_list = []
basictest_list = []

In [24]:
basicvectorizer_list.append(CountVectorizer(ngram_range=(1,1)))
basictrain_list.append(basicvectorizer_list[0].fit_transform(headlines))

testheadlines = []
for row in range(0,len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row,2:27]))
basictest_list.append(basicvectorizer_list[0].transform(testheadlines))

In [25]:
basicvectorizer_list.append(CountVectorizer(ngram_range=(2,2)))
basictrain_list.append(basicvectorizer_list[1].fit_transform(headlines))

testheadlines = []
for row in range(0,len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row,2:27]))
basictest_list.append(basicvectorizer_list[1].transform(testheadlines))

In [26]:
basicvectorizer_list.append(CountVectorizer(ngram_range=(3,3)))
basictrain_list.append(basicvectorizer_list[2].fit_transform(headlines))

testheadlines = []
for row in range(0,len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row,2:27]))
basictest_list.append(basicvectorizer_list[2].transform(testheadlines))

#### Test Classifiers

In [76]:
classifiers = [LogisticRegression(random_state=42),
               XGBClassifier(),
               RandomForestClassifier(),
               LinearSVC(),
               DecisionTreeClassifier()]

In [77]:
for clf in classifiers:
    
    clf_name = clf.__class__.__name__
    print(f"---> Algorithm: {clf_name} <---")
    
    for basicvectorizer, basictrain, basictest in zip(basicvectorizer_list, basictrain_list, basictest_list):
    
        basicmodel = clf
        basicmodel = basicmodel.fit(basictrain, train["Label"])
    
        predictions = basicmodel.predict(basictest)
    
        print(f"Confussion Matrix for: {basicvectorizer}") 
        print("----------------------------------------------------------")
        print(pd.crosstab(test["Label"], predictions, rownames=["Actual"], colnames=["Predicted"]))
    
        print("Classification report:")
        print("----------------------")
        print(classification_report(test["Label"], predictions))
        print(f"Accuracy score: {accuracy_score(test['Label'], predictions)}")
    
        print("----------------------------------------------------------")

---> Algorithm: LogisticRegression <---
Confussion Matrix for: CountVectorizer()
----------------------------------------------------------
Predicted    0    1
Actual             
0          150   36
1           30  162
Classification report:
----------------------
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       186
           1       0.82      0.84      0.83       192

    accuracy                           0.83       378
   macro avg       0.83      0.83      0.83       378
weighted avg       0.83      0.83      0.83       378

Accuracy score: 0.8253968253968254
----------------------------------------------------------
Confussion Matrix for: CountVectorizer(ngram_range=(2, 2))
----------------------------------------------------------
Predicted    0    1
Actual             
0          159   27
1           27  165
Classification report:
----------------------
              precision    recall  f1-score   support

           0  

#### GridSearch Tunning

In [72]:
# Models for GridSearchCV

models = [

          {'name': 'Logistic Regression','label': 'Logistic Regression',
           'classifier': LogisticRegression(random_state=42),
           'grid': {'penalty':['l2'],
                    'C':[5],
                    'solver':['newton-cg','lbfgs'],
                    'max_iter':[100],
                    'multi_class':['ovr','multinomial']}},
          
          {'name': 'Xgboost','label':'Xgboost',
           'classifier': XGBClassifier(verbosity = 0, silent=True, random_state=42),
           'grid':{'seed':[0],
                   'n_estimators':[5,10],
                   'learning_rate':[0.1],
                   'subsample':[0.8, 0.9],
                   'objective':['binary:logistic'],
                   'max_depth':[2,3,4],
                   'gamma':[1,2,3],
                   'min_child_weight':[2,3,4]}},
       
          {'name': 'RandomForestClassifier','label':'RandomForestClassifier',
           'classifier': RandomForestClassifier(random_state=42), 
           'grid':{'max_depth'   : [3, 10, 20],
                   'min_samples_leaf': [0.05, 0.1],
                   'max_features': ['sqrt', 'log2'],
                   'ccp_alpha': [0, 0.01]}},
                  
          {'name': 'LinearSVC','label':'LinearSVC',
           'classifier': LinearSVC(random_state=42),
           'grid':{'C': [.1, 1]}},
    
         {'name': 'DecisionTreeClassifier', 'label': 'DecisionTreeClassifier',
          'classifier': DecisionTreeClassifier(random_state=42),
          'grid': {'max_leaf_nodes': list(range(2, 10)),
                   'min_samples_split': [2, 4]}}, 
    
         ]

In [73]:
def model_selection(classifier, name, grid, X_train, y_train, X_test, y_test, scoring1, scoring2):
    
    gridsearch_cv1=GridSearchCV(classifier, 
                               grid,
                               cv=5, 
                               scoring = scoring1)
    
    gridsearch_cv2=GridSearchCV(classifier, 
                               grid,
                               cv=5, 
                               scoring = scoring2)
    
    gridsearch_cv1.fit(X_train, y_train)
    gridsearch_cv2.fit(X_train, y_train)
    
    results_dict = {} # Scores with data training
    
    results_dict['classifier_name'] = name    
    results_dict['classifier'] = gridsearch_cv1.best_estimator_
    results_dict['best_params'] = gridsearch_cv1.best_params_
    
    # Traininig Scores
    results_dict['accuracy-train'] = gridsearch_cv1.best_score_
    results_dict['f1 score-train'] = gridsearch_cv2.best_score_

    # Test Scores    
    y_pred = gridsearch_cv1.best_estimator_.predict(X_test)
    results_dict['accuracy-test'] = accuracy_score(y_test, y_pred) #, 'accuracy')
    
    y_pred = gridsearch_cv2.best_estimator_.predict(X_test)
    results_dict['f1 score-test'] = f1_score(y_test, y_pred, average='weighted')
    
    return(results_dict)

results = []

for mod in models:    
    print(mod['name'], ".....")    
    
    for basicvectorizer, basictrain, basictest in zip(basicvectorizer_list, basictrain_list, basictest_list):
    
        X_train = basictrain
        y_train = train['Label']
        X_test = basictest
        y_test = test['Label']
        
        results.append(model_selection(mod['classifier'], 
                                       mod['name'],
                                       mod['grid'],
                                       X_train, 
                                       y_train, 
                                       X_test,
                                       y_test,
                                       'accuracy',
                                       'f1_weighted'))      
        print('      ....ready ', basicvectorizer)
        
    print('....ready ', mod['name'])

Logistic Regression .....
      ....ready  CountVectorizer()
      ....ready  CountVectorizer(ngram_range=(2, 2))
      ....ready  CountVectorizer(ngram_range=(3, 3))
....ready  Logistic Regression
Xgboost .....
      ....ready  CountVectorizer()
      ....ready  CountVectorizer(ngram_range=(2, 2))
      ....ready  CountVectorizer(ngram_range=(3, 3))
....ready  Xgboost
RandomForestClassifier .....
      ....ready  CountVectorizer()
      ....ready  CountVectorizer(ngram_range=(2, 2))
      ....ready  CountVectorizer(ngram_range=(3, 3))
....ready  RandomForestClassifier
LinearSVC .....
      ....ready  CountVectorizer()
      ....ready  CountVectorizer(ngram_range=(2, 2))
      ....ready  CountVectorizer(ngram_range=(3, 3))
....ready  LinearSVC
DecisionTreeClassifier .....
      ....ready  CountVectorizer()
      ....ready  CountVectorizer(ngram_range=(2, 2))
      ....ready  CountVectorizer(ngram_range=(3, 3))
....ready  DecisionTreeClassifier


In [None]:
results_df = pd.DataFrame(results).sort_values(by='accuracy-test', ascending = False)
print(results_df)
print(results_df[['classifier_name','accuracy-train','accuracy-test','f1 score-train','f1 score-test']])

In [75]:
lr_params = results_df[results_df['classifier_name'] == 'Logistic Regression'].best_params.values
xgb_params = results_df[results_df['classifier_name'] == 'Xgboost'].best_params.values
rf_params = results_df[results_df['classifier_name'] == 'RandomForestClassifier'].best_params.values
lsvc_params = results_df[results_df['classifier_name'] == 'LinearSVC'].best_params.values
dst_params = results_df[results_df['classifier_name'] == 'DecisionTreeClassifier'].best_params.values

## Best Final Score:
#### DecisionTreeClassifier : Accuracy score: 0.968904828043348
#### RandomForestClassifier : Accuracy score: 0.953514258991477
#### XGBClassifier          : Accuracy score: 0.951157452412595