# Predict the News Sentiment using transformers

In [None]:
#Loading and installing libraries
!pip install simpletransformers
import os
import pandas as pd
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split

###  Reading the data using pandas

In [None]:
# Load the training CSV into `df`, then print each column's dtype and non-null count.
df = pd.read_csv("../input/news-popularity-in-multiple-social-media-platforms/train_file.csv")
df.info()

There are 175 (<0.1%) null values in "Source"

In [None]:
# Checking for duplicates: df.duplicated() marks rows that repeat an earlier row,
# so the indexing below displays only the repeated rows (an empty frame means none).
df[df.duplicated()]

The above commands confirms that there are no duplicate rows

In [None]:
# Preview the first four rows of the training data.
df.head(4)

### Seperating text data for analysis

In [None]:
# Independent copy of the text columns, so changes to df_Text never touch df.
df_Text = df[["Title","Headline","Source","Topic"]].copy()

In [None]:
#Creating word cloud (based on title and headline) for each of the topic
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
def wordcloud_draw(data, color = 'black'):
    """Draw one word cloud per news topic from a column of text.

    Parameters
    ----------
    data : pandas.Series
        Text column row-aligned with the global ``df_Text`` (for example
        ``df_Text.Title``); rows are grouped using ``df_Text.Topic``.
    color : str, default 'black'
        Background colour handed to ``WordCloud``.

    Notes
    -----
    Words containing one of the topic keywords, and words starting with
    ``@`` or ``#``, are removed before the cloud is built. Missing text
    values are skipped, and the subplot grid grows with the number of
    topics instead of being fixed at 2 x 2.
    """
    import math  # local import: only needed to size the subplot grid

    topic_keywords = ('obama', 'economy', 'microsoft', 'palestine')
    topics = df_Text.Topic.value_counts().index.tolist()

    n_cols = 2
    n_rows = max(1, math.ceil(len(topics) / n_cols))

    # Create the figure once per call; re-requesting figure 1 inside the loop
    # would reuse whatever an earlier call had already drawn on it.
    plt.figure(figsize=(20, 20))
    for position, topic in enumerate(topics, start=1):
        # dropna/astype guard against NaN cells, which str.join cannot handle.
        texts = data[df_Text.Topic == topic].dropna().astype(str)
        cleaned_word = " ".join(
            word for word in ' '.join(texts).split()
            if not any(keyword in word.lower() for keyword in topic_keywords)
            and not word.startswith(('@', '#'))
        )

        plt.subplot(n_rows, n_cols, position)
        if cleaned_word:
            # An empty string would make generate() fail, so such panels stay blank.
            wordcloud = WordCloud(stopwords=STOPWORDS,
                                  background_color=color,
                                  width=2500,
                                  height=2000,
                                  max_words=200
                                  ).generate(cleaned_word)
            plt.imshow(wordcloud)
        plt.title(topic)
        plt.axis('off')
    plt.show()
    
# Render the per-topic word clouds for the Title column on a white background.
print("Word cloud of Title")
wordcloud_draw(df_Text.Title,'white')

In [None]:
# Render the per-topic word clouds for the Headline column on a white background.
print("Word cloud of Headline ")
wordcloud_draw(df_Text.Headline,'white')

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the number of news items per topic (value_counts orders bars from most to least frequent).
df_Text['Topic'].value_counts().plot.bar()

Among the news items least were palestine and highest were economy related



In [None]:
# Rows whose headline never mentions the topic name (case-insensitive substring test).
headline_lacks_topic = [
    topic.lower() not in headline.lower()
    for topic, headline in zip(df['Topic'], df['Headline'])
]
df.loc[headline_lacks_topic, ['Topic', 'Headline']]

The above analysis shows that ~ 20% of news items dont have topics in their headline



In [None]:
# Rows whose title never mentions the topic name (case-insensitive substring test).
title_lacks_topic = [
    topic.lower() not in title.lower()
    for topic, title in zip(df['Topic'], df['Title'])
]
df.loc[title_lacks_topic, ['Topic', 'Title']]

The above analysis shows that ~ 15% of news items dont have topics in their Title


### Seperating non text data for analysis

In [None]:
# Independent copy of the numeric columns: three platform popularity scores plus both sentiment scores.
df_notText=df[["Facebook","GooglePlus","LinkedIn","SentimentTitle","SentimentHeadline"]].copy()

In [None]:
# Histogram of the headline sentiment scores.
df['SentimentHeadline'].hist()

In [None]:
# Histogram of the title sentiment scores.
df['SentimentTitle'].hist()

In [None]:
# Statistical summary of the dataset: count, mean, std, min, quartiles and max
# for each of the numeric columns held in df_notText.
df_notText.describe()


* The above results and histogram shows that most of the data has neutral comments ,this is confirmed by the SentimentTitle and SentimentHeadline column as the 25 and 75 percentile are around the neutral value i.e. near to zero.
* Also Facebook,GooglePlus,LinkedIn ,  -ve or low-value means, the news was not so engaging and interesting and didn’t reach out to many people in that particular platform as confirmed by the 75% precentile of the data from the three columns are closer to 0 .
* Facebook has the higher reach as compared to GooglePlus and Linkedin.








In [None]:
# Bar graph depicting total sentiment for the different topics.
# The two sentiment columns are selected *before* summing, so the aggregation
# never touches the text columns (Title, Headline, Source, ...), which cannot be
# meaningfully summed and may fail or be needlessly slow depending on the pandas version.
# Only two series are plotted, so only two colours are supplied.

df.groupby('Topic')[['SentimentHeadline', 'SentimentTitle']].sum().plot(kind='bar', figsize=(25, 7),
                                                          stacked=False, color=['b', 'r']);

After reading the news Headline, sentiment increased towards negative side.

In [None]:
import numpy as np
# Label how the sentiment sign changes between the title and the headline.
# A two-way np.where would put every row that is not "title -ve, headline +ve"
# into "title +ve, headline -ve", including rows where both scores share a sign
# or are exactly neutral (0). np.select keeps those rows in their own bucket.
sentiment_conditions = [
    (df['SentimentTitle'] < 0) & (df['SentimentHeadline'] > 0),
    (df['SentimentTitle'] > 0) & (df['SentimentHeadline'] < 0),
]
sentiment_labels = ['Title -ve Headline +ve', 'Title +ve Headline -ve']
df_notText['SentimentChange'] = np.select(sentiment_conditions, sentiment_labels,
                                          default='No sign change')

In [None]:
import seaborn as sns
# Count of news items per sentiment-change label, split by topic.
# The series is passed as x= explicitly: a positional first argument is
# deprecated in seaborn 0.11 and is interpreted as `data` from 0.12 onwards.
sns.countplot(x=df_notText.SentimentChange, hue=df.Topic)


In [None]:
#Correlation Matrix

# Pairwise correlations between the three platform popularity scores and both sentiment scores.
corr_columns = ['Facebook','GooglePlus','LinkedIn','SentimentTitle','SentimentHeadline']
corr_matrix = df_notText[corr_columns].corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, square=True, cmap='Blues', linewidths=0.5,
            linecolor='w', annot=True, ax=ax)
ax.set_title('Correlation matrix ')

plt.show()

This shows that Facebook,GooglePlus,Linkedin doesn’t have any correlation with the target variable SentimentTitle and SentimentHeadline


# Model Training

In [None]:
# Two-column (text, target) frame for the title regressor; the target column is renamed to "labels".
df_title = df[["Title", "SentimentTitle"]].rename(columns={"SentimentTitle": "labels"})

# 80/20 train/test split, then 10% of the training part is held out as the evaluation set.
train_df_title, test_df_title = train_test_split(df_title, test_size=0.2, random_state=42)
train_df_title, eval_df_title = train_test_split(train_df_title, test_size=0.1, random_state=42)

# Give the training rows a fresh 0..n-1 index after the shuffled splits.
train_df_title = train_df_title.reset_index(drop=True)
train_df_title.head()

In [None]:
# Training configuration for the title-sentiment regressor.
train_args = {
    "reprocess_input_data": True,   # rebuild features instead of reusing cached ones
    "overwrite_output_dir": True,
    "regression": True,             # single continuous target instead of class labels
    "num_train_epochs":3,
    # Dedicated output directory: with the library default, the headline model
    # trained later would save over this model's checkpoint.
    "output_dir": "outputs/title/",
    # Fixed seed so repeated runs give comparable results.
    "manual_seed": 42,
}

# Create a TransformerModel (num_labels=1 because this is a regression task)
model1 = ClassificationModel("distilbert", "distilbert-base-cased", num_labels=1, args=train_args)

# Train the model
model1.train_model(train_df_title,eval_df=eval_df_title)
# To reuse an already saved model instead of retraining:
#   model = ClassificationModel("distilbert", "path_to_model", num_labels=1, args=train_args)
# Evaluate the model on the evaluation split
result1, model_outputs1, wrong_predictions1 = model1.eval_model(eval_df=eval_df_title)
print(result1)

In [None]:
# Show the evaluation metrics returned by model1.eval_model again.
print(result1)

In [None]:
# Load the test CSV (the rows to predict for the submission) and preview it.
test_df = pd.read_csv("../input/news-popularity-in-multiple-social-media-platforms/test_file.csv")
test_df.head()

In [None]:
# Predict title sentiment for every test row; the two return values of predict()
# are unpacked into preds1 (used below as the SentimentTitle column) and out1.
preds1, out1 = model1.predict(test_df['Title'].to_list())
print(preds1)

In [None]:
# Attach the predicted title sentiment to the test frame.
test_df["SentimentTitle"] = preds1

In [None]:
# Preview the test frame, now including the SentimentTitle column.
test_df.head()

In [None]:
# Two-column (text, target) frame for the headline regressor; the target column is renamed to "labels".
df_Head = df[["Headline", "SentimentHeadline"]].rename(columns={"SentimentHeadline": "labels"})

# 80/20 train/test split, then 10% of the training part is held out as the evaluation set.
train_df_head, test_df_head = train_test_split(df_Head, test_size=0.2, random_state=42)
train_df_head, eval_df_head = train_test_split(train_df_head, test_size=0.1, random_state=42)

# Give the training rows a fresh 0..n-1 index after the shuffled splits.
train_df_head = train_df_head.reset_index(drop=True)

In [None]:
# Training configuration for the headline-sentiment regressor.
train_args = {
    "reprocess_input_data": True,   # rebuild features instead of reusing cached ones
    "overwrite_output_dir": True,
    "regression": True,             # single continuous target instead of class labels
    "num_train_epochs":3,
    # Dedicated output directory: with the library default, this model would
    # save over the checkpoint written by the title model.
    "output_dir": "outputs/headline/",
    # Fixed seed so repeated runs give comparable results.
    "manual_seed": 42,
}
# Create a TransformerModel (num_labels=1 because this is a regression task)
model2 = ClassificationModel("distilbert", "distilbert-base-cased", num_labels=1, args=train_args)
# Train the model
model2.train_model(train_df_head,eval_df=eval_df_head)
# Evaluate the model on the evaluation split
result2, model_outputs2, wrong_predictions2 = model2.eval_model(eval_df=eval_df_head)
print(result2)

In [None]:
# Predict headline sentiment for every test row; preds2 becomes the SentimentHeadline column below.
# NOTE(review): the list is passed to predict() as-is — confirm the test file's
# Headline column has no missing values, since NaN entries are not strings.
preds2, out2 = model2.predict(test_df['Headline'].to_list())
print(preds2)

In [None]:
# Attach the predicted headline sentiment to the test frame.
test_df["SentimentHeadline"] = preds2

## Mae calculation for both title and headline sentiments

In [None]:
# Predict on the 20% hold-out splits produced by train_test_split; these rows
# carry true labels, so the predictions can be scored with MAE.
preds3, out3 = model1.predict(test_df_title.Title.to_list())
preds4, out4 = model2.predict(test_df_head.Headline.to_list())


In [None]:

from sklearn.metrics import mean_absolute_error

# Mean absolute error of each model on its hold-out split (true labels vs. predictions).
title_true = test_df_title.labels.to_list()
headline_true = test_df_head.labels.to_list()
mae_title = mean_absolute_error(title_true, preds3)
mae_headline = mean_absolute_error(headline_true, preds4)

In [None]:
# Display both MAE values as a (title, headline) tuple.
mae_title,mae_headline

In [None]:
# Combined score: 1 minus the weighted MAE (title weight 0.4, headline weight 0.6).
# NOTE(review): the weights presumably follow the competition's scoring rule — confirm.
score=1-((0.4*mae_title)+(0.6*mae_headline))

In [None]:
# Report the score both as a raw value and as a percentage rounded to two decimals.
score_percent = round(score*100, 2)
print(f"Score = {score} \nScore(out of 100%) = {score_percent}%")

Submission Result

In [None]:
# Keep only the id and the two predicted sentiment columns, then write them to CSV without the index.
submission_columns = ["IDLink", "SentimentTitle", "SentimentHeadline"]
submission = test_df[submission_columns]
submission.to_csv("sample_submission.csv", index=False)

In [None]:
# Preview the first ten rows of the submission.
submission.head(10)