# Load Required Libraries

In [1]:
import numpy as np
import pandas as pd
from scikitplot.metrics import plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import roc_auc_score,log_loss
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# Location of the pre-processed news workbook (machine-specific absolute path).
DATA_PATH = 'D:\\DBDA Project\\Excels\\AllDataProcessed2.xlsx'

# Read the workbook into a DataFrame and show a plain-text preview of the first ten rows.
data = pd.read_excel(DATA_PATH)
print(data.head(10))

        Date company                                               news  \
0 2019-01-14     HCL  HCL Tech Q3 PAT seen up 4.2% QoQ to Rs. 2,629....   
1 2018-12-07     HCL  HCL Technologies-IBM deal fails to enthuse inv...   
2 2018-12-06     HCL  Technical Views | Top buy & sell ideas by Ashw...   
3 2018-11-09     HCL  HCL Technologies Limited Q2 FY’19 Earnings Con...   
4 2018-10-29     HCL  Buy HCL Technologies, target Rs 1182: Anand Rathi   
5 2018-10-24     HCL  HCL Technologies – good outlook backed by reas...   
6 2018-10-23     HCL  HCL Technologies sees 5.7% sequential rise in ...   
7 2018-10-22     HCL  HCL Technologies Q2 results on October 23; her...   
8 2018-10-15     HCL  Stock Picks of the Day | Nifty may be vulnerab...   
9 2018-10-08     HCL  HCL Tech to set up global IT centres in Andhra...   

   loss/profit   %change                                     Processed_news  
0            0 -0.472399   hcl tech q3 pat see up 4 2 % qoq 7 dolat capital  
1            0 -3.

In [3]:
# Model input is the processed news text; the target is the loss/profit column.
X, y = data["Processed_news"], data["loss/profit"]

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2019,
)

# Function to evaluate various algorithms

In [5]:
def evaluate_features(X, y, clf=None, n_splits=6):
    """Cross-validate a classifier on the given features and print its metrics.

    Out-of-fold class probabilities are collected with plain K-fold
    cross-validation (consecutive folds, no shuffling, no stratification).
    From them the function prints the log loss, accuracy, classification
    report and confusion matrix.

    Args:
        X (array-like): Features array. Shape (n_samples, n_features)

        y (array-like): Labels array. Shape (n_samples,)

        clf: Classifier supporting ``predict_proba``. If None, a default
            ``LogisticRegression`` is used.

        n_splits (int): Number of cross-validation folds. Defaults to 6.

    Returns:
        None. All results are printed.
    """
    if clf is None:
        clf = LogisticRegression()

    # No random_state here: without shuffle=True it has no effect on the folds,
    # and recent scikit-learn versions raise a ValueError when it is supplied
    # together with shuffle=False.
    folds = KFold(n_splits=n_splits)
    probas = cross_val_predict(clf, X, y, cv=folds,
                               n_jobs=-1, method='predict_proba')

    # np.unique returns the labels sorted; the indexing below relies on the
    # probability columns being in that same order.
    classes = np.unique(y)
    preds = classes[np.argmax(probas, axis=1)]

    print('Log loss: {}'.format(log_loss(y, probas)))
    print('Accuracy: {}'.format(accuracy_score(y, preds)))
    print('Classification report',classification_report(y,preds))
    print('Confusion Matrix  ', confusion_matrix(y,preds))

# Using Count Vectorizer

In [6]:
# Bag-of-words counts over unigrams and bigrams.
# NOTE(review): the vectorizer is fitted on every row of `data`, so the vocabulary
# is learned before any cross-validation split is made.
count_vectorizer = CountVectorizer(ngram_range=(1,2))  

bag_of_words = count_vectorizer.fit_transform(data['Processed_news'])

# Vocabulary size. `vocabulary_` is used instead of get_feature_names(), which
# newer scikit-learn releases no longer provide; the count is the same.
len(count_vectorizer.vocabulary_)

38525

In [7]:
# Baseline: default classifier of evaluate_features on the bag-of-words counts.
evaluate_features(X=bag_of_words, y=data['loss/profit'])

Log loss: 0.7995815259357771
Accuracy: 0.5456415809420683
Classification report               precision    recall  f1-score   support

           0       0.55      0.58      0.57      4738
           1       0.54      0.51      0.52      4497

   micro avg       0.55      0.55      0.55      9235
   macro avg       0.54      0.54      0.54      9235
weighted avg       0.55      0.55      0.55      9235

Confusion Matrix   [[2757 1981]
 [2215 2282]]


In [8]:
# Random forest on the bag-of-words counts.
evaluate_features(
    bag_of_words,
    data['loss/profit'],
    clf=RandomForestClassifier(n_estimators=300, max_depth=30, random_state=2019),
)

Log loss: 0.6900467547121742
Accuracy: 0.5405522468868436
Classification report               precision    recall  f1-score   support

           0       0.53      0.82      0.65      4738
           1       0.56      0.25      0.35      4497

   micro avg       0.54      0.54      0.54      9235
   macro avg       0.55      0.53      0.50      9235
weighted avg       0.55      0.54      0.50      9235

Confusion Matrix   [[3873  865]
 [3378 1119]]


In [9]:
# Linear-kernel SVM on the bag-of-words counts; probability=True enables predict_proba.
evaluate_features(
    bag_of_words,
    data['loss/profit'],
    clf=SVC(kernel='linear', probability=True, random_state=2019),
)

Log loss: 0.6894828005718083
Accuracy: 0.5390362750406064
Classification report               precision    recall  f1-score   support

           0       0.54      0.76      0.63      4738
           1       0.55      0.31      0.39      4497

   micro avg       0.54      0.54      0.54      9235
   macro avg       0.54      0.53      0.51      9235
weighted avg       0.54      0.54      0.51      9235

Confusion Matrix   [[3591 1147]
 [3110 1387]]


# Using TF-IDF

In [10]:
# TF-IDF features with the vectorizer's default settings.
# NOTE(review): despite its name, `count_vectorizer` holds a TfidfVectorizer from
# here on; the name is kept so any later cell that refers to it keeps working.
count_vectorizer = TfidfVectorizer()    

tfidf = count_vectorizer.fit_transform(data['Processed_news'])

# Vocabulary size. `vocabulary_` is used instead of get_feature_names(), which
# newer scikit-learn releases no longer provide; the count is the same.
len(count_vectorizer.vocabulary_)

1749

In [11]:
# Baseline: default classifier of evaluate_features on the TF-IDF features.
evaluate_features(X=tfidf, y=data['loss/profit'])

Log loss: 0.7009660189832868
Accuracy: 0.5378451543042772
Classification report               precision    recall  f1-score   support

           0       0.55      0.59      0.57      4738
           1       0.53      0.48      0.50      4497

   micro avg       0.54      0.54      0.54      9235
   macro avg       0.54      0.54      0.54      9235
weighted avg       0.54      0.54      0.54      9235

Confusion Matrix   [[2792 1946]
 [2322 2175]]


In [12]:
# Linear-kernel SVM on the TF-IDF features; probability=True enables predict_proba.
evaluate_features(
    tfidf,
    data['loss/profit'],
    clf=SVC(kernel='linear', probability=True, random_state=2019),
)

Log loss: 0.6894071945901016
Accuracy: 0.5356794802382241
Classification report               precision    recall  f1-score   support

           0       0.54      0.72      0.62      4738
           1       0.54      0.34      0.41      4497

   micro avg       0.54      0.54      0.54      9235
   macro avg       0.54      0.53      0.51      9235
weighted avg       0.54      0.54      0.52      9235

Confusion Matrix   [[3434 1304]
 [2984 1513]]


In [13]:
# RBF-kernel SVM on the TF-IDF features with hand-picked C and gamma.
evaluate_features(
    tfidf,
    data['loss/profit'],
    clf=SVC(kernel='rbf', probability=True, random_state=45, C=2, gamma=.01),
)

Log loss: 0.6897448911893616
Accuracy: 0.5431510557661072
Classification report               precision    recall  f1-score   support

           0       0.54      0.78      0.64      4738
           1       0.56      0.30      0.39      4497

   micro avg       0.54      0.54      0.54      9235
   macro avg       0.55      0.54      0.51      9235
weighted avg       0.55      0.54      0.51      9235

Confusion Matrix   [[3689 1049]
 [3170 1327]]


# LightGBM

In [100]:
# TF-IDF over unigrams and bigrams with sublinear term-frequency scaling,
# fitted on the training split only.
basictf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=0.0,
    max_df=1.0,
    use_idf=True,
    sublinear_tf=True,
)
basicTrain = basictf.fit_transform(X_train)
print(basicTrain.shape)

(8907, 58111)


In [101]:
import lightgbm as lgb

In [102]:
# Wrap the training features and labels in a LightGBM dataset.
d_train = lgb.Dataset(basicTrain, label=y_train)

# Booster configuration for binary classification, evaluated with binary log loss.
params = {
    'learning_rate': 0.002,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'sub_feature': 0.5,
    'num_leaves': 32,
    'min_data': 50,
    'max_depth': 20,
    'max_bin': 1024,
}

# Train for 1000 boosting rounds.
clf = lgb.train(params, d_train, num_boost_round=1000)

In [103]:
# Vectorise the held-out texts with the vectorizer fitted on the training split
# (transform only, so no vocabulary is learned from the test rows).
basicTest = basictf.transform(X_test)
# Raw model outputs for the test rows; they are turned into 0/1 labels in a later cell.
y_pred = clf.predict(basicTest)
# Last expression of the cell, so the array is displayed.
y_pred

array([0.42222875, 0.48595289, 0.44494029, ..., 0.51849818, 0.41840873,
       0.48488979])

In [106]:
# Turn the predicted scores into hard 0/1 labels with a 0.5 threshold.
# The comparison runs over the whole array, so it works for any test-set size;
# a hard-coded row count fails or leaves raw scores behind when the size differs.
# Re-running the cell is harmless: 0 and 1 map back to themselves.
y_pred = (y_pred >= 0.5).astype(int)

In [107]:
# Confusion matrix and accuracy on the held-out split.
# Both metrics take the true labels first and the predictions second.
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[770 366]
 [653 438]]
0.5424337674000899
