# Prediction based on the frequencies of some words
We'll analyse the frequency of 17 chosen words in tweets : 'win', 'lose', 'draw', 'goal', 'red', 'yellow', 'penalty', 'foul', 'offfside', 'corner', 'free', 'kick', 'score', 'assist', 'pass', 'tackle'


# Importing packages

In [2]:
import os
import re
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.tree import DecisionTreeClassifier

# Preprocessing tweets
I used a similar preprocessing that in the example of kaggle

In [None]:
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)


# Preprocess the train tweets

li = []
for filename in os.listdir("train_tweets"):
    print("File :", filename)
    df = pd.read_csv("train_tweets/" + filename)
    df['Tweet'] = df['Tweet'].apply(preprocess_text)
    li.append(df)

df = pd.concat(li, ignore_index=True)

os.mkdir("train_tweets_preprocessed")
df.to_csv("train_tweets_preprocessed/all_matches_preprocessed.csv", index=False)

# Preprocess the eval tweets

li = []
for filename in os.listdir("eval_tweets"):
    print("File :", filename)
    df = pd.read_csv("eval_tweets/" + filename)
    df['Tweet'] = df['Tweet'].apply(preprocess_text)
    li.append(df)

df = pd.concat(li, ignore_index=True)

os.mkdir("eval_tweets_preprocessed")
df.to_csv("eval_tweets_preprocessed/all_matches_preprocessed.csv", index=False)


# Compute the frequencies of our chosen words
We compute the frequency : tweets containing the word / total tweets 

In [None]:
#Words to search in tweets
words = ['win', 'lose', 'draw', 'goal', 'red', 'yellow', 'penalty', 'foul', 'offfside', 'corner', 'free', 'kick', 'score', 'assist', 'pass', 'tackle']


#Compute for train tweets

df = pd.read_csv("train_tweets_preprocessed/all_matches_preprocessed.csv")

for word in words:
    # Count the occurence of the word in each tweet
    df['is_'+word+'_in_tweet'] = df['Tweet'].apply(lambda x: x.split().count(word))

#Create a columns with only ones => To count the number of tweets
df['tweet_count'] = 1

# Sum the number of tweets where the word is present and groupd by ID
specify_groupby = {'ID': 'first', 'MatchID': 'first', 'EventType': 'first', 'tweet_count': 'sum'}
for word in words:
    specify_groupby['is_'+word+'_in_tweet'] = 'sum'
df_grouped_period = df.groupby('ID').agg(specify_groupby)

#In the new DataFrame, the column 'is_word_in_tweet' contains the number of tweets where the word is present
#We add a column that is the division of the number of tweets where the word is present by the total number of tweets

for word in words:
    df_grouped_period['tweet_freq_with_'+word+'_in'] = df_grouped_period['is_'+word+'_in_tweet'] / df_grouped_period['tweet_count']

df_grouped_period.to_csv("train_tweets_preprocessed/all_frequencies_by_period.csv", index=False)


#Compute for eval tweets

df = pd.read_csv("eval_tweets_preprocessed/all_matches_preprocessed.csv")

for word in words:
    df['is_'+word+'_in_tweet'] = df['Tweet'].apply(lambda x: word in x)

df['tweet_count'] = 1

specify_groupby = {'ID': 'first', 'MatchID': 'first', 'tweet_count': 'sum'}
for word in words:
    specify_groupby['is_'+word+'_in_tweet'] = 'sum'
df_grouped_period = df.groupby('ID').agg(specify_groupby)

for word in words:
    df_grouped_period['tweet_freq_with_'+word+'_in'] = df_grouped_period['is_'+word+'_in_tweet'] / df_grouped_period['tweet_count']

df_grouped_period.to_csv("eval_tweets_preprocessed/all_frequencies_by_period.csv", index=False)

# Try prediction on frequencies
Change the max depth of the tree as you want

In [None]:
# Set the training columns
words = ['win', 'lose', 'draw', 'goal', 'red', 'yellow', 'penalty', 'foul', 'offfside', 'corner', 'free', 'kick', 'score', 'assist', 'pass', 'tackle']

train_columns = ['tweet_freq_with_'+word+'_in' for word in words]

tree_depth = 4 # CAN BE CHANGED AS THE USER WANTS

df = pd.read_csv('train_tweets_preprocessed/all_frequencies_by_period.csv')

# Obtains the cross-validation accuracy

match_ids = df['MatchID'].unique()
accuracies = []

for match in match_ids:
    df_train = df[df['MatchID'] != match].copy()    
    df_eval = df[df['MatchID'] == match].copy()

    decision_tree = DecisionTreeClassifier(max_depth=tree_depth)
    decision_tree.fit(df_train[train_columns].values, df_train['EventType'].values)

    df_eval['Prediction'] = decision_tree.predict(df_eval[train_columns].values)

    correct_predictions = (df_eval['Prediction'] == df_eval['EventType']).sum()
    total_predictions = len(df_eval)
    accuracy = correct_predictions / total_predictions
    #print(f"Correct predictions: {correct_predictions}, Total predictions: {total_predictions} => Accuracy: {accuracy}")
    accuracies.append(accuracy)

print(f"Mean accuracy: {np.mean(accuracies)}")

#Train the model on the full training set and predict on the eval set
df_train = pd.read_csv('train_tweets_preprocessed/all_frequencies_by_period.csv')
df_eval = pd.read_csv('eval_tweets_preprocessed/all_frequencies_by_period.csv')

decision_tree = DecisionTreeClassifier(max_depth=tree_depth)
decision_tree.fit(df_train[train_columns], df_train['EventType'])

df_eval['EventType'] = decision_tree.predict(df_eval[train_columns])

df_predicted = df_eval[['ID', 'EventType']].copy()

os.makedirs("eval_tweets_prediction", exist_ok=True)
df_predicted.to_csv("eval_tweets_prediction/predictions_with_word_count_(depth="+str(tree_depth)+").csv", index=False)

Mean accuracy: 0.6624448642793095


# Computing word frequency per match to normalize frequencies

In [None]:
#Words used for training
words = ['win', 'lose', 'draw', 'goal', 'red', 'yellow', 'penalty', 'foul', 'offfside', 'corner', 'free', 'kick','score', 'assist', 'pass', 'tackle']

#Compute for train tweets
df = pd.read_csv("train_tweets_preprocessed/all_frequencies_by_period.csv")

#Sum the number of tweets where the word is present and groupd by MatchID
specify_groupby = {'MatchID': 'first', 'tweet_count': 'sum'}
for word in words:
    specify_groupby['is_'+word+'_in_tweet'] = 'sum'
df_grouped_match = df.groupby('MatchID').agg(specify_groupby)

#Divide by the total number of tweets to obtain the frequency of each word per match
for word in words:
    df_grouped_match['tweet_freq_with_'+word+'_in'] = df_grouped_match['is_'+word+'_in_tweet'] / df_grouped_match['tweet_count']

df_grouped_match.to_csv("train_tweets_preprocessed/all_frequencies_by_match.csv", index=False)


#Compute for eval tweets
df = pd.read_csv("eval_tweets_preprocessed/all_frequencies_by_period.csv")

specify_groupby = {'MatchID': 'first', 'tweet_count': 'sum'}
for word in words:
    specify_groupby['is_'+word+'_in_tweet'] = 'sum'
df_grouped_match = df.groupby('MatchID').agg(specify_groupby)

for word in words:
    df_grouped_match['tweet_freq_with_'+word+'_in'] = df_grouped_match['is_'+word+'_in_tweet'] / df_grouped_match['tweet_count']

df_grouped_match.to_csv("eval_tweets_preprocessed/all_frequencies_by_match.csv", index=False)



# Standardize frequencies with previously computed match frequencies

In [None]:
#Word used for training
words = ['win', 'lose', 'draw', 'goal', 'red', 'yellow', 'penalty', 'foul', 'offfside', 'corner', 'free', 'kick', 'score', 'assist', 'pass', 'tackle']

# Function to compute the ratio between the previous frequency and the frequency of the word in the match
def freq_to_freq_ratio(value, word,  matchID):
    if value == 0:
        return 0
    return value / df_match.loc[df_match['MatchID'] == matchID, 'tweet_freq_with_'+word+'_in'].iloc[0] 


#Compute for train tweets
df_period = pd.read_csv("train_tweets_preprocessed/all_frequencies_by_period.csv")
df_match = pd.read_csv("train_tweets_preprocessed/all_frequencies_by_match.csv")

for word in words:
    df_period['tweet_freq_ratio_with_'+word+'_in'] = df_period.apply(lambda x: freq_to_freq_ratio(x['tweet_freq_with_'+word+'_in'], word, x['MatchID']), axis=1)

df_period.to_csv("train_tweets_preprocessed/all_frequencies_by_period_with_ratio.csv", index=False)

#Compute for eval tweets
df_period = pd.read_csv("eval_tweets_preprocessed/all_frequencies_by_period.csv")
df_match = pd.read_csv("eval_tweets_preprocessed/all_frequencies_by_match.csv")

for word in words:
    df_period['tweet_freq_ratio_with_'+word+'_in'] = df_period.apply(lambda x: freq_to_freq_ratio(x['tweet_freq_with_'+word+'_in'], word, x['MatchID']), axis=1)

df_period.to_csv("eval_tweets_preprocessed/all_frequencies_by_period_with_ratio.csv", index=False)



# Try prediction on standardized frequencies

In [None]:
# Set the training columns
words = ['win', 'lose', 'draw', 'goal', 'red', 'yellow', 'penalty', 'foul', 'offfside', 'corner', 'free', 'kick', 'score', 'assist', 'pass', 'tackle']

train_columns = ['tweet_freq_ratio_with_'+word+'_in' for word in words]

tree_depth = 4

df = pd.read_csv('train_tweets_preprocessed/all_frequencies_by_period_with_ratio.csv')

# Obtains the cross-validation accuracy
match_ids = df_train['MatchID'].unique()
accuracies = []

for match in match_ids:
    df_train = df[df['MatchID'] != match].copy()    
    df_eval = df[df['MatchID'] == match].copy()

    decision_tree = DecisionTreeClassifier(max_depth=tree_depth)
    decision_tree.fit(df_train[train_columns].values, df_train['EventType'].values)

    df_eval['Prediction'] = decision_tree.predict(df_eval[train_columns].values)

    correct_predictions = (df_eval['Prediction'] == df_eval['EventType']).sum()
    total_predictions = len(df_eval)
    accuracy = correct_predictions / total_predictions
    #print(f"Correct predictions: {correct_predictions}, Total predictions: {total_predictions} => Accuracy: {accuracy}")
    accuracies.append(accuracy)

print(f"Mean accuracy: {np.mean(accuracies)}")

#Train the model on the full training set and predict on the eval set
df_train = pd.read_csv('train_tweets_preprocessed/all_frequencies_by_period_with_ratio.csv')
df_eval = pd.read_csv('eval_tweets_preprocessed/all_frequencies_by_period_with_ratio.csv')

decision_tree = DecisionTreeClassifier(max_depth=tree_depth)
decision_tree.fit(df_train[train_columns], df_train['EventType'])

df_eval['Prediction'] = decision_tree.predict(df_eval[train_columns])

df_predicted = df_eval[['ID', 'Prediction']].copy()

os.makedirs("eval_tweets_prediction", exist_ok=True)
df_predicted.to_csv("eval_tweets_prediction/predictions_with_word_count_normalized_(depth="+str(tree_depth)+").csv", index=False)

Mean accuracy: 0.6316739003954741
