# Sentiment Analysis on Financial News Dataset.

## 1a. Load Required Libraries 

In [94]:
import numpy as np
import pandas as pd
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [95]:
!pip install nltk
import nltk



## 1b. Load the dataset into an appropriate dataframe

In [96]:
import os

# Location of the headline CSV (read without a header row, Latin-1 encoded).
# Defaults to the path this notebook was originally run with; set the
# FINANCIAL_NEWS_CSV environment variable to point at your own copy so the
# notebook can run on another machine without editing this cell.
DATA_PATH = os.environ.get('FINANCIAL_NEWS_CSV', r'/Users/priyankat/Downloads/all-data.csv')
df = pd.read_csv(DATA_PATH, encoding='latin1', header=None)

## 1c. Understanding The Data

In [97]:
# (rows, columns) of the freshly loaded frame.
df.shape

(4846, 2)

In [98]:
# Preview the first rows; the columns carry default integer labels because
# the file was read with header=None.
df.head()

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


## 1d. Defining Column Names

In [99]:
# Give the two positional columns readable names: label first, headline text second.
column_names = ["Sentiment", "News"]
df.columns = column_names
df.head()

Unnamed: 0,Sentiment,News
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


## 1e. Defining x and y variables

In [100]:
# Features are the headline text, targets are the sentiment labels.
# Both still cover the full dataset at this point; the train/test split happens later.
text_train = df["News"]
y_train = df["Sentiment"]

## 2a. Remove unwanted characters

In [101]:
import re #importing regular expression
def remove_unwanted_chars(text):
    """Delete every character of `text` that is neither an ASCII letter nor whitespace.

    Inside the character class a leading ^ negates the set, and \\s (lowercase s)
    matches whitespace (spaces, tabs, newlines). Matches are removed outright
    (replaced by the empty string), so digits and punctuation vanish without
    leaving a separator behind.
    """
    non_letter_or_space = re.compile(r'[^A-Za-z\s]')
    return non_letter_or_space.sub('', text)
# Clean every headline; from here on the text holds only ASCII letters and whitespace.
text_train=text_train.apply(remove_unwanted_chars)

In [102]:
# Preview the headlines after character stripping.
text_train.head()

0    According to Gran  the company has no plans to...
1    Technopolis plans to develop in stages an area...
2    The international electronic industry company ...
3    With the new production plant the company woul...
4    According to the company s updated strategy fo...
Name: News, dtype: object

## 2b. Remove URLs

In [103]:
def remove_url(text):
    """Remove URL-like tokens from `text`.

    The pattern has two alternatives joined by | (OR):
      * http followed by one or more non-whitespace characters (\\S+, capital S);
      * a literal "www." (the dot is escaped, because a bare dot means
        "any character") followed by one or more non-whitespace characters.
    Each match is replaced by the empty string.
    """
    url_pattern = re.compile(r'http\S+|www\.\S+')
    return url_pattern.sub('', text)
# Apply URL removal to every headline and preview the result.
# NOTE(review): the character-stripping step earlier in the notebook has already
# deleted '.', ':' and '/', so the `www\.` alternative can no longer match here;
# consider running URL removal before that step.
text_train=text_train.apply(remove_url)
text_train.head()

0    According to Gran  the company has no plans to...
1    Technopolis plans to develop in stages an area...
2    The international electronic industry company ...
3    With the new production plant the company woul...
4    According to the company s updated strategy fo...
Name: News, dtype: object

## 2c. Remove HTML Tags

In [104]:
def remove_html_tags(text):
    """Drop every `<...>` span from `text`.

    The match is non-greedy, so each span ends at the first `>` after its `<`;
    a `<` with no later `>` on the same line is left untouched.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)
# Apply tag removal to every headline and preview the result.
# NOTE(review): the character-stripping step earlier in the notebook already removed
# '<' and '>', so no tag can still be present here (and any tag names were left behind
# as plain words); run this step first if tag removal is actually needed.
text_train=text_train.apply(remove_html_tags)
text_train.head()

0    According to Gran  the company has no plans to...
1    Technopolis plans to develop in stages an area...
2    The international electronic industry company ...
3    With the new production plant the company woul...
4    According to the company s updated strategy fo...
Name: News, dtype: object

## 2d. Lowercase the text

In [105]:
# Lowercase every headline with the vectorised string accessor, then preview.
text_train = text_train.str.lower()
text_train.head()

0    according to gran  the company has no plans to...
1    technopolis plans to develop in stages an area...
2    the international electronic industry company ...
3    with the new production plant the company woul...
4    according to the company s updated strategy fo...
Name: News, dtype: object

## 2e. Stopwords

In [106]:
from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is available locally before reading it.
nltk.download('stopwords')
# Collect the English stopwords into a set for fast membership tests.
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/priyankat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [107]:
# Sorted so the listing is stable between kernel sessions (set iteration order is not).
print(sorted(stop_words))

{'y', 'with', "that'll", 'didn', 'this', 'no', 'our', 'when', "he'd", 'about', 'can', 'until', "shan't", 'my', 'those', 'ma', 'but', 'a', "i've", 'its', 'both', 'the', 'during', "doesn't", 'too', 'for', "mightn't", 'be', "weren't", 'off', 'again', "aren't", 'now', 'while', "should've", "we'll", 'of', 'further', 'he', 'do', 'itself', 'she', 'aren', 'did', "i'll", 'will', 'wouldn', 'through', 'it', 's', "they've", "wouldn't", 'why', 'not', 'or', 'above', 'same', 'and', 'being', 'how', "wasn't", 'an', 'on', 'because', 'shouldn', 'as', 'very', "we're", 'from', 'hasn', 'over', 'we', 'himself', 'just', 'so', 'had', 'haven', "don't", "hadn't", 'after', 'whom', "you're", 'herself', 'doing', 'ourselves', 'your', "hasn't", 'hadn', 'shan', "won't", 're', 'couldn', 'if', 'few', 'into', "didn't", 'mightn', 'other', "it'd", 'o', 'these', 'most', "we've", 'yourself', 'me', 'then', 'up', 'out', 'has', "isn't", "she'll", "it's", 'they', 'been', 'i', 'down', 'm', 'won', "i'd", 'more', 'any', 'them', 'ha

In [108]:
# Words to keep even when they appear in the NLTK stopword list
# (e.g. negations such as 'no' / 'not').
# NOTE(review): by this stage the text contains only letters and whitespace, so the
# apostrophe forms listed here (e.g. "didn't") cannot match any token — confirm intent.
retain_words = {
    'no', 'not', 'low', 'up', 'few', 'some', "haven't", 'has', "didn't",
    'against', 'would', 'should', "shouldn't", 'did', 'same', "doesn't", 'won',
}
custom_stopwords = stop_words.difference(retain_words)


def remove_stopwords(text):
    """Return `text` with every token found in `custom_stopwords` dropped.

    Tokens are split on whitespace and re-joined with single spaces.
    `custom_stopwords` is looked up at call time from the notebook namespace.
    """
    kept_tokens = (token for token in text.split() if token not in custom_stopwords)
    return ' '.join(kept_tokens)


text_train = text_train.apply(remove_stopwords)

In [109]:
# Preview the headlines after stopword removal.
text_train.head()

0    according gran company has no plans move produ...
1    technopolis plans develop stages area no less ...
2    international electronic industry company elco...
3    new production plant company would increase ca...
4    according company updated strategy years baswa...
Name: News, dtype: object

## 2f. Finding Frequently Occurring Words

In [110]:
from collections import Counter

# Corpus-wide token tally; filled in as a side effect of get_frequent_words().
cnt = Counter()


def get_frequent_words(text):
    """Add every whitespace-separated token of `text` to the module-level `cnt` tally.

    Returns nothing; the counts accumulate in `cnt` across calls.
    """
    cnt.update(text.split())
        

In [111]:
# Start from an empty tally so that re-running this cell does not add the
# same headlines to `cnt` a second time.
cnt.clear()

for text in text_train:
    get_frequent_words(text)

# Every distinct token with its count, most frequent first.
cnt.most_common()

[('eur', 1310),
 ('company', 848),
 ('mn', 593),
 ('has', 579),
 ('said', 544),
 ('finnish', 512),
 ('sales', 453),
 ('million', 441),
 ('net', 412),
 ('profit', 409),
 ('finland', 337),
 ('group', 320),
 ('operating', 299),
 ('mln', 290),
 ('year', 282),
 ('new', 267),
 ('business', 265),
 ('period', 264),
 ('oyj', 241),
 ('quarter', 238),
 ('share', 237),
 ('also', 224),
 ('services', 223),
 ('market', 217),
 ('shares', 198),
 ('first', 193),
 ('up', 182),
 ('euro', 180),
 ('helsinki', 163),
 ('loss', 153),
 ('compared', 149),
 ('today', 149),
 ('operations', 149),
 ('contract', 142),
 ('nokia', 139),
 ('total', 137),
 ('financial', 134),
 ('mobile', 134),
 ('percent', 131),
 ('production', 130),
 ('products', 130),
 ('per', 129),
 ('corporation', 129),
 ('bank', 127),
 ('according', 123),
 ('companies', 122),
 ('hel', 121),
 ('technology', 120),
 ('corresponding', 119),
 ('plant', 118),
 ('solutions', 117),
 ('service', 116),
 ('increased', 109),
 ('construction', 109),
 ('capital',

## 2g. Creating Customised Stopwords

In [112]:
# Corpus-specific filler tokens to drop in the next cleaning pass.
# Note: this rebinds `custom_stopwords`, replacing the NLTK-derived set of the same name.
custom_stopwords = {
    'eur', 'company', 'mn', 'said', 'million', 'finland', 'group', 'mln',
    'year', 'new', 'business', 'period', 'oyj', 'e', 'market', 'hel',
    'usd', 'via', 'also', 'per',
}

In [113]:
# `remove_stopwords` (defined in the stopword-removal step above) reads
# `custom_stopwords` when it is called, so after that name was rebound to the
# corpus-specific set the same function now filters those tokens; an identical
# second definition is not needed.
text_train = text_train.apply(remove_stopwords)

## 2h. Lemmatizer

In [114]:
from nltk.stem import WordNetLemmatizer
# Make sure the WordNet corpus is available locally before lemmatizing.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with single spaces.

    `lemmatizer.lemmatize` is called with its default arguments for every token.
    """
    tokens = text.split()
    lemmas = map(lemmatizer.lemmatize, tokens)
    return " ".join(lemmas)


text_train = text_train.apply(lemmatize_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/priyankat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2i. Spellcheck

In [115]:
!pip install pyspellchecker



In [116]:
from spellchecker import SpellChecker

spell = SpellChecker()

# Correction chosen for each distinct token seen so far. The same words recur
# across many headlines, so each one is looked up only once.
_spelling_cache = {}


def correct_spelling(text):
    """Return `text` with each whitespace-separated token replaced by its spelling correction.

    A token is kept unchanged when `spell.correction` returns None for it.
    Tokens are re-joined with single spaces. Results are memoised per distinct
    token in `_spelling_cache`, so repeated words are not re-checked.
    """
    corrected_words = []
    for word in text.split():
        if word not in _spelling_cache:
            suggestion = spell.correction(word)
            # If None, keep the original word
            _spelling_cache[word] = word if suggestion is None else suggestion
        corrected_words.append(_spelling_cache[word])
    return ' '.join(corrected_words)


text_train = text_train.apply(correct_spelling)

## 3. Splitting Data into Test and Train

In [117]:
# Hold out 20% of the cleaned headlines for evaluation (fixed seed for reproducibility).
# The labels are taken straight from df["Sentiment"] rather than from `y_train`: this
# statement rebinds `y_train` to the training subset, so feeding `y_train` back in made a
# second run of the cell pair the full-length `text_train` with the shorter label series.
X_train, X_test, y_train, y_test = train_test_split(
    text_train, df["Sentiment"], test_size=0.2, random_state=42
)

## 4. Bag-of-Words Vectorization

In [118]:
# Bag-of-words features: the vectorizer is fitted on the training headlines only,
# and the test headlines are transformed with that already-fitted vectorizer.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

## 5. TF-IDF Vectorization

In [119]:
# TF-IDF features: the vectorizer is fitted on the training headlines only,
# and the test headlines are transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## 6a. Random Forest Model on BOW

In [120]:
# Train a random forest (fixed seed) on the bag-of-words features,
# predict the held-out headlines and print the per-class metrics.
rf_bow = RandomForestClassifier(random_state=42).fit(X_train_bow, y_train)
y_pred_bow = rf_bow.predict(X_test_bow)
print("BOW Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred_bow))

BOW Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.50      0.59       110
     neutral       0.75      0.94      0.84       571
    positive       0.83      0.51      0.63       289

    accuracy                           0.76       970
   macro avg       0.77      0.65      0.69       970
weighted avg       0.77      0.76      0.75       970



## 6b. Random Forest Model on TF-IDF

In [121]:
# Train a random forest (fixed seed) on the TF-IDF features,
# predict the held-out headlines and print the per-class metrics.
rf_tfidf = RandomForestClassifier(random_state=42).fit(X_train_tfidf, y_train)
y_pred_tfidf = rf_tfidf.predict(X_test_tfidf)
print("TF-IDF Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred_tfidf))

TF-IDF Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.47      0.58       110
     neutral       0.75      0.94      0.83       571
    positive       0.80      0.50      0.61       289

    accuracy                           0.76       970
   macro avg       0.77      0.64      0.68       970
weighted avg       0.76      0.76      0.74       970

