# Library

In [3]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import textstat
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from textblob import TextBlob
from nltk.probability import FreqDist

In [42]:
df=pd.read_csv("data.csv")
df = df.drop('tweet_date', axis=1)

dfDate=pd.read_csv('dataAppleDate.csv')

df = pd.merge(df, dfDate, on='id')
df.drop_duplicates(subset="id", keep=False, inplace=True)


In [9]:
# Load the spaCy model for NLP processing
nlp = spacy.load("en_core_web_sm")

def count_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]
    return len(entities)

def sentiment_score(text):
    doc = nlp(text)
    sentiment = doc.sentiment
    return sentiment.polarity

def average_word_length(text):
    words = text.split()
    total_word_length = 0
    for word in words:
        total_word_length += len(word)
    average_word_length = total_word_length / len(words)
    return average_word_length

def readability_score(text):
    return textstat.flesch_reading_ease(text)
    
def get_time_period(hour):
    if hour >= 0 and hour < 6:
        return "midnight to 6 AM"
    elif hour >= 6 and hour < 12:
        return "6 AM to 12 PM"
    elif hour >= 12 and hour < 18:
        return "12 PM to 6 PM"
    else:
        return "6 PM to midnight"

# function to extract the most common word from a tweet
def extract_common_word(tweet):
    words = nltk.word_tokenize(tweet)
    words = [word.lower() for word in words]
    stop_words = set(nltk.corpus.stopwords.words("english"))
    words = [word for word in words if word.isalpha() and word not in stop_words]
    fdist = FreqDist(words)
    common_word = fdist.most_common(1)[0][0]
    score = fdist.most_common(1)[0][1]
    return common_word, score

# Preprocessing

In [44]:
# Create an instance of LabelEncoder
le = LabelEncoder()

# Add a feature for text length
df['text_length'] = df['text'].apply(len)
df['average_word_length'] = df['text'].apply(average_word_length)

# Add a feature for arousal level
sia = SentimentIntensityAnalyzer()
df['arousal'] = df['text'].apply(lambda x: sia.polarity_scores(x)['compound'])
# df['sentiment_score'] = df['text'].apply(sentiment_score)
df['entities_count'] = df['text'].apply(count_entities)
df['readability_score'] = df['text'].apply(readability_score)

# Convert the "tweet_date" column to a datetime format
df["tweet_date"] = pd.to_datetime(df["tweet_date"])
df["time_period"] = df["tweet_date"].dt.hour.apply(get_time_period)

df['sentiment'] = le.fit_transform(df['sentiment'])
df['tag'] = le.fit_transform(df['tag'])
df['tweet_date'] = le.fit_transform(df['tweet_date'])
df['time_period'] = le.fit_transform(df['time_period'])

# Preparing Data

In [45]:
df_Tweet_Metrics=df[['id','like_count','retweet_count','quote_count']]

# Data that can be collected to complete the model in the future
# df_Tweet_Metrics=df[['id','like_count','retweet_count','quote_count','reply_count','impression_count','metrics24h','metrics7day']]

# df_Tweet_Text=df[['id','author_id','tag','sentiment','arousal','average_word_length','text_length','entities_count','readability_score','time_period','tweet_date','jaccard']]
df_Tweet_Text=df[['id','sentiment','time_period','tweet_date','jaccard','average_word_length','text_length']]
# Data that can be collected to complete the model in the future
# df_Tweet_Text=df[['text','clean_text','sentiment','arousal','average_word_length','text_length','readability_score','time_period','tweet_date','geo','possibly_sensitive','reply_settings','context','jaccard']]

df_User=df[['id','followers_count','following_count']]
# Data that can be collected to complete the model in the future
# df_User=df[['followers_count','following_count','metrics_24h','metrics_7day','SST','LSM','verified','location','description','sentiment_description','verified_type','tweet_count','fakeAccountRate']]


# Virality :

In [None]:
# virality_metrics = (like_count + retweet_count + quote_count) * sentiment_score

# virality_evolution = delta (metrics_T0, metrics_24H, metrics7DAY)

# virality_interaction = (tweet_metrics) * sentiment_score + (reply_metrics,quote_metrics) * sentiment_score 

# virality_global = virality_metrics + virality_evolution + virality_interaction

# Model prediction

In [46]:
df2 = pd.merge(df_User, df_Tweet_Text, on='id')
df2 = df2.drop('id', axis=1)
df_Tweet_Metrics = df_Tweet_Metrics.drop('id', axis=1)

# Split the data into training and test sets
train_data, test_data, train_target, test_target = train_test_split(df2, df_Tweet_Metrics, test_size=0.2)

# Fit a linear regression model to the training data
regressor = LinearRegression()
regressor.fit(train_data, train_target)

# Use the model to make predictions on the test data
predictions = regressor.predict(test_data)

# Evaluate the model's performance by comparing the predictions to the actual target values
error = test_target - predictions

# Model Evaluation

In [47]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate the mean absolute error (MAE)
# MAE measures the average magnitude of the errors in a set of predictions
# A lower MAE indicates a better fit
mae = mean_absolute_error(test_target, predictions)
print("Mean Absolute Error (MAE):", mae)

# Calculate the mean squared error (MSE)
# MSE measures the average of the square of the errors, which gives more weight to larger errors
# A lower MSE indicates a better fit
mse = mean_squared_error(test_target, predictions)
print("Mean Squared Error (MSE):", mse)

# Calculate the root mean squared error (RMSE)
# RMSE is the square root of the MSE, and is used to provide a more interpretable result
# A lower RMSE indicates a better fit
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

# Calculate R-squared
# R-squared measures the proportion of the variance in the target variable that is explained by the model
# A higher R-squared value indicates a better fit
r2 = r2_score(test_target, predictions)
print("R-Squared (R2):", r2)

# Calculate the mean absolute percentage error (MAPE)
# MAPE measures the average percentage error of the predictions
# A lower MAPE indicates a better fit
mape = np.mean(np.abs((test_target - predictions) / test_target)) * 100
print("Mean Absolute Percentage Error (MAPE):", mape, "%")


Mean Absolute Error (MAE): 2291.869359565716
Mean Squared Error (MSE): 150629777.7915829
Root Mean Squared Error (RMSE): 12273.132354520703
R-Squared (R2): 0.009051736237079297
Mean Absolute Percentage Error (MAPE): like_count              inf
retweet_count    312.842376
quote_count      364.695997
dtype: float64 %


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
