In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer
import re
import os

# Fetch the NLTK resources this notebook relies on: the stop-word corpus
# (read through `nltk_stopwords`) and WordNet (used by `WordNetLemmatizer`).
# The calls are kept as two bare statements so the return value of the last
# one remains the cell's displayed output.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/shash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/shash/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Read all training files and concatenate them into one dataframe.
train_csv_path = "/Users/shash/c_ml/kaggle_ml/challenge_data/train_tweets/"

# os.listdir() returns entries in arbitrary order and also lists hidden files
# (e.g. macOS ".DS_Store") and sub-directories. Sort the names so the row order
# of the concatenated frame is reproducible, and keep only visible regular
# files so pd.read_csv is never handed something that is not a data file.
train_files = sorted(
    os.path.join(train_csv_path, name)
    for name in os.listdir(train_csv_path)
    if not name.startswith(".")
    and os.path.isfile(os.path.join(train_csv_path, name))
)
if not train_files:
    # Fail with a message that names the folder instead of pd.concat's
    # generic "No objects to concatenate".
    raise FileNotFoundError(f"No data files found in {train_csv_path}")

li = [pd.read_csv(path) for path in train_files]

train_df = pd.concat(li, ignore_index=True)
print(train_df.shape)

(5056050, 6)


In [None]:
# Read all evaluation files and concatenate them into one dataframe.
test_csv_path = "/Users/shash/c_ml/kaggle_ml/challenge_data/eval_tweets/"

# os.listdir() returns entries in arbitrary order and also lists hidden files
# (e.g. macOS ".DS_Store") and sub-directories. Sort the names so the row order
# of the concatenated frame is reproducible, and keep only visible regular
# files so pd.read_csv is never handed something that is not a data file.
test_files = sorted(
    os.path.join(test_csv_path, name)
    for name in os.listdir(test_csv_path)
    if not name.startswith(".")
    and os.path.isfile(os.path.join(test_csv_path, name))
)
if not test_files:
    # Fail with a message that names the folder instead of pd.concat's
    # generic "No objects to concatenate".
    raise FileNotFoundError(f"No data files found in {test_csv_path}")

li = [pd.read_csv(path) for path in test_files]

test_df = pd.concat(li, ignore_index=True)
print(test_df.shape)

(1072928, 5)


In [7]:
def preprocessing(df, max_period_id=129, stop_words=None, lemmatizer=None):
    """Filter a tweets dataframe and normalise its ``Tweet`` column.

    Row filtering (rows failing any condition are dropped):
      * ``Tweet`` must not be missing;
      * ``Tweet`` must not contain an ``@mention`` -- this also removes
        retweets, since every ``RT @user:`` prefix contains a mention;
      * ``PeriodID`` must be ``<= max_period_id``.

    Text normalisation applied to the remaining tweets, in order:
      1. remove team names / three-letter team codes (case-insensitive,
         whole words only);
      2. remove URLs (``http`` followed by non-whitespace);
      3. replace every run of non-letter, non-whitespace characters
         (punctuation, digits, emoji, ...) with a single space;
      4. lowercase;
      5. drop stop words, lemmatise each remaining token and re-join the
         tokens with single spaces.

    Finally, rows that are identical across *all* columns are de-duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Tweet'`` and ``'PeriodID'``. The frame
        passed in is not modified.
    max_period_id : int, default 129
        Largest ``PeriodID`` value to keep.
    stop_words : iterable of str, optional
        Lower-case tokens to remove. Defaults to NLTK's English stop words.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word) -> str``. Defaults to a new
        ``WordNetLemmatizer``.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same columns as ``df``; the original index
        labels of the surviving rows are preserved.
    """
    import functools  # local import: only needed for the lemma cache below

    teams = (
        "Argentina", "Belgium", "Germany", "Australia", "Netherlands", "Spain", "South Korea",
        "Cameroon", "Brazil", "France", "Nigeria", "Algeria", "USA", "Honduras", "Switzerland",
        "Mexico", "Croatia", "Chile", "Portugal", "Ghana", "Slovenia", "Serbia", "Greece",
        "Ivory Coast",
        "ARG", "BEL", "GER", "AUS", "NED", "ESP", "KOR", "CMR", "BRA", "FRA", "NGA", "ALG",
        "HON", "SUI", "MEX", "CRO", "CHL", "POR", "GHA", "SVN", "SRB", "GRE", "CIV",
    )
    team_regex = r'\b(?:' + '|'.join(re.escape(team) for team in teams) + r')\b'

    if stop_words is None:
        stop_words = nltk_stopwords.words("english")
    stop_words = frozenset(stop_words)

    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    # The same tokens recur across many tweets, so cache each lemma instead
    # of recomputing it for every occurrence.
    lemmatize = functools.lru_cache(maxsize=None)(lemmatizer.lemmatize)

    raw = df['Tweet']
    # Cast once so the .str accessor is usable whatever the column dtype is;
    # genuinely missing tweets are excluded through raw.notna() below.
    text = raw.astype(str)
    keep = (
        raw.notna()
        & ~text.str.contains(r"@\w+", regex=True, na=False)
        & (df['PeriodID'] <= max_period_id)
    )

    # Explicit copy: the result is an independent frame, so assigning the
    # cleaned column neither warns about nor writes into the caller's data.
    result = df.loc[keep].copy()

    cleaned = (
        text.loc[keep]
        .str.replace(team_regex, '', case=False, regex=True)
        .str.replace(r"http\S+", "", regex=True)
        .str.replace(r"[^a-zA-Z\s]+", " ", regex=True)
        .str.lower()
    )

    def _normalise(tweet):
        # Single pass per tweet: filter stop words, then lemmatise survivors.
        return ' '.join(
            lemmatize(token) for token in tweet.split() if token not in stop_words
        )

    result['Tweet'] = cleaned.map(_normalise)

    # Remove rows that are exact duplicates across every column.
    return result.drop_duplicates()

In [8]:
# Clean both splits with the same pipeline and report the resulting shapes.
# NOTE: this rebinds train_df / test_df to the cleaned frames, so re-run the
# loading cells first if you need to start again from the raw data.
train_df = preprocessing(train_df)
print(train_df.shape)

test_df = preprocessing(test_df)
print(test_df.shape)

# to_pickle() does not create missing parent directories; make sure the
# output folder exists so the writes below cannot fail on a fresh checkout.
os.makedirs('final_data', exist_ok=True)
test_df.to_pickle('final_data/test_df.pkl')
train_df.to_pickle('final_data/train_df.pkl')

print("Data preprocessing done!")

(1778619, 6)
(443811, 5)
Data preprocessing done!
