In [None]:
# Dataframe
import pandas as pd

#nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#Regular Expressions
import re

# Matplotlib
import matplotlib.pyplot as plt 

#Numpy
import numpy as np 

# Seaborn
import seaborn as sns 

# Scikit-learn
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read Dataset

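# Dataset Details
# messages : the text of the news item
# lables   : sentiment of the news (neutral, positive, negative); the column name is spelled 'lables' throughout this notebook
# target   : integer encoding of the sentiment, derived further down (0 = neutral, 1 = negative, 2 = positive)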

In [None]:
# Load the raw CSV. Passing names= without header= makes pandas treat the
# first row as data and label the two columns with the names given here.
# NOTE(review): if this raises UnicodeDecodeError, the file is probably not
# UTF-8 (an explicit encoding such as 'latin-1' may be needed) — confirm.
DATA_PATH = '../input/sentiment-analysis-for-financial-news/all-data.csv'
COLUMN_NAMES = ['lables', 'messages']

news_df = pd.read_csv(DATA_PATH, names=COLUMN_NAMES)
news_df.head()

In [None]:
# Count the missing values in each column
news_df.isna().sum()

In [None]:
# Shape of our dataset as (number of rows, number of columns)
news_df.shape

In [None]:
# Work on a copy so the frame loaded from disk (news_df) stays untouched
# when columns are added later.
df_copy = news_df.copy()
df_copy.head()

In [None]:
# Map each sentiment label (string) to an integer target
# neutral  -> 0
# negative -> 1
# positive -> 2

def func(df):
    """Encode one sentiment label as its integer class id.

    Parameters
    ----------
    df : str
        A single sentiment label (despite the name, this is one cell value,
        not a DataFrame): 'neutral', 'negative' or 'positive'. Surrounding
        whitespace and letter case are ignored.

    Returns
    -------
    int
        0 for neutral, 1 for negative, 2 for positive.

    Raises
    ------
    ValueError
        If the value is not one of the three known sentiments (this includes
        missing values). A catch-all branch would silently count such rows
        as positive and corrupt the training target.
    """
    codes = {'neutral': 0, 'negative': 1, 'positive': 2}
    key = df.strip().lower() if isinstance(df, str) else df
    try:
        return codes[key]
    except (KeyError, TypeError):
        # KeyError: unknown label / NaN; TypeError: unhashable value
        raise ValueError(f"Unknown sentiment label: {df!r}") from None
# Encode every label with func and store the result as a new 'target' column
df_copy['target'] = df_copy['lables'].apply(func)
df_copy.head()

In [None]:
# Count plot for labels feature
# Colours are looked up by label rather than by bar position, so each
# sentiment keeps its colour even if the frequency order of the classes changes.
label_colors = {'neutral': 'deepskyblue', 'positive': 'lime', 'negative': 'red'}
label_counts = df_copy.lables.value_counts()

fig, ax = plt.subplots(figsize=(9,7))
label_counts.plot(
    kind='bar',
    # Any label without an assigned colour falls back to gray instead of failing
    color=[label_colors.get(label, 'gray') for label in label_counts.index],
    ax=ax,
)
ax.set_xlabel("News Type")
ax.set_ylabel("count")
ax.set_title("Count Plot for labels", fontdict={'fontsize':20});

In [None]:
# Text preprocessing
# Each message is reduced to lowercase letters only, split into tokens,
# stripped of English stop words, stemmed, and re-joined into one string.
corpous = list()

#Object for porterstem
ps = PorterStemmer()

# Build the stop-word set once. Constructing it inside the comprehension
# condition would rebuild the same set for every single token.
english_stopwords = set(stopwords.words('english'))

# Iterate over the values directly so the loop does not depend on the frame
# having a default 0..n-1 index.
for message in df_copy.messages:
    text = re.sub(r'[^a-zA-Z]',' ',message)  # replace every non-letter with a space
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if word not in english_stopwords]
    text = ' '.join(text)
    corpous.append(text)


In [None]:
# We use CountVectorizer to turn each cleaned message into a vector of counts: one entry per vocabulary term (single words and short n-grams), holding how often that term occurs in the message.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features over 1- to 3-word n-grams, capped at 5000 vocabulary
# entries; the sparse count matrix is then converted to a dense array.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split below, so test documents influence which features exist.
cv = CountVectorizer(ngram_range=(1, 3), max_features=5000)
term_counts = cv.fit_transform(corpous)
X = term_counts.toarray()

In [None]:
# Dependent variable: the integer-encoded sentiment of each message
y = df_copy['target']

In [None]:
# This is one of the most important steps: before applying the algorithms we split the data into a training set and a test set. The models are fitted on the training portion and then evaluated on the held-out test portion, which tells us how well they predict the sentiment of news they have not seen. The split is done with the code below:

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

In [None]:
# List of feature names 

In [None]:
# Show the first ten vocabulary entries.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Prefer the new method and fall
# back to the old one only on installations that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
list(feature_names[:10])

In [None]:
# Build Models

# The dict is keyed by the estimator instance and maps to its display name;
# the evaluation cells iterate it as (model, name) pairs.
models = dict()
models[MultinomialNB()] = 'Multinomial Naive Bayes'
models[LogisticRegression(max_iter=300)] = 'Logistic Regression'
# NOTE(review): degree and gamma should have no effect with kernel='linear'
# — confirm against the SVC documentation before removing them.
models[SVC(C=1.0, kernel='linear', degree=3, gamma='auto')] = "Support Vector Machine"

# Fit every estimator on the training split
for estimator in models:
    estimator.fit(X_train, y_train)

In [None]:
# Report the test-set accuracy of each fitted model as a percentage
for model, name in models.items():
    accuracy_pct = model.score(X_test, y_test) * 100
    print(f"Accuracy Score for {name} is : ", accuracy_pct, "%")

In [None]:
# Plot Heatmaps for all models

# Class ids produced by the label encoding (neutral=0, negative=1, positive=2)
# and their display names in the same order.
class_names = [0, 1, 2]
class_labels = ['neutral', 'negative', 'positive']

for model,name in models.items():
    y_pred = model.predict(X_test)
    # Pin the label order so rows/columns always line up with class_labels,
    # even if a class happens to be absent from y_test or y_pred.
    cnf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)
    cnf_frame = pd.DataFrame(cnf_matrix, index=class_labels, columns=class_labels)

    # One fresh figure per model: axis settings made on a single figure
    # created before the loop would only ever reach the first heatmap.
    fig, ax = plt.subplots()
    sns.heatmap(cnf_frame, annot=True, cmap='PuOr', fmt='g', ax=ax)
    ax.xaxis.set_label_position('top')
    ax.set_title(f'Heat Map for {name}', fontdict={'fontsize':20})
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    fig.tight_layout()
    plt.show()
    # Release the figure so looping over many models does not pile up open figures
    plt.close(fig)

In [None]:
# Classification Report
# Print per-class precision / recall / F1 on the test split for every model,
# framed by a horizontal rule.
separator = "----------------------------------------------------------"
for model, name in models.items():
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(f"Classification Report for {name}")
    print(separator)
    print(report)
    print(separator)