In [12]:
import pandas as pd
import numpy as np

In [13]:
# Locations of the pickled train / test tables.
train_path = 'final_data/train_df.pkl'
test_path = 'final_data/test_df.pkl'

# Load both frames in one pass (train first, then test).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_df, test_df = (
    pd.read_pickle(pkl_path) for pkl_path in (train_path, test_path)
)

# Cell output: (rows, columns) of each split.
train_df.shape, test_df.shape

((1778619, 6), (443811, 5))

In [14]:
import pandas as pd
import numpy as np

def create_time_series_features(train_df, test_df, window=3):
    """Build per-period tweet-volume features for the train and test splits.

    Rows of both frames are pooled and counted per (MatchID, PeriodID, ID);
    the group size is reported as ``tweet_count`` and every other feature is
    derived from that count within its match.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the columns 'MatchID', 'PeriodID', 'ID' and 'EventType'.
    test_df : pandas.DataFrame
        Must contain the columns 'MatchID', 'PeriodID' and 'ID'.
    window : int, default 3
        Number of consecutive periods used for the rolling mean / std.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_features, test_features)``. Both hold the columns
        ID, MatchID, PeriodID, tweet_count, rolling_mean_tweets,
        rolling_std_tweets, relative_frequency, tweet_momentum and
        period_percentile; ``train_features`` additionally carries EventType.

    Notes
    -----
    * Neither input frame is modified.
    * ``rolling_std_tweets`` is NaN wherever the window holds a single
      period (e.g. the first period of a match); NaNs are not filled here.
    * ``relative_frequency`` divides by the match total over *all* periods
      of that match.
    * If a period carries more than one distinct EventType in ``train_df``
      it yields one output row per distinct value.
    """
    key_cols = ['MatchID', 'PeriodID', 'ID']

    # Pool only the key columns: nothing else is needed for counting, and
    # slicing yields new frames so the caller's data is left untouched.
    combined_df = pd.concat(
        [train_df[key_cols], test_df[key_cols]], axis=0, ignore_index=True
    )

    # 1. Tweet count per period. groupby returns groups sorted by key, so
    #    rows inside each match are in ascending PeriodID order, which the
    #    rolling / momentum features below rely on.
    tweet_counts = combined_df.groupby(key_cols).size().reset_index(name='tweet_count')

    per_match = tweet_counts.groupby('MatchID')['tweet_count']

    # 2. Rolling statistics over the last `window` periods of the match.
    rolling_mean = per_match.transform(
        lambda x: x.rolling(window=window, min_periods=1).mean()
    )
    rolling_std = per_match.transform(
        lambda x: x.rolling(window=window, min_periods=1).std()
    )

    # 3. Share of the match's total tweets that fall in this period.
    relative_frequency = tweet_counts['tweet_count'] / per_match.transform('sum')

    # 4. Momentum: relative change versus the previous period (0 for the
    #    first period of a match, which has no predecessor).
    tweet_momentum = per_match.transform(lambda x: x.pct_change().fillna(0))

    # 5. Percentile rank of the period's count within its match.
    period_percentile = per_match.transform(lambda x: x.rank(pct=True))

    tweet_counts = tweet_counts.assign(
        rolling_mean_tweets=rolling_mean,
        rolling_std_tweets=rolling_std,
        relative_frequency=relative_frequency,
        tweet_momentum=tweet_momentum,
        period_percentile=period_percentile,
    )

    final_features = tweet_counts[['ID', 'MatchID', 'PeriodID', 'tweet_count', 'rolling_mean_tweets',
                                   'rolling_std_tweets', 'relative_frequency', 'tweet_momentum',
                                   'period_percentile']]

    # Split back into train and test by the periods present in each input;
    # the train side also picks up its label column.
    train_features = final_features.merge(
        train_df[['MatchID', 'PeriodID', 'EventType']].drop_duplicates(),
        on=['MatchID', 'PeriodID'],
        how='inner'
    )

    test_features = final_features.merge(
        test_df[['MatchID', 'PeriodID']].drop_duplicates(),
        on=['MatchID', 'PeriodID'],
        how='inner'
    )

    return train_features, test_features


In [15]:
# Build the feature tables for both splits and replace every NaN with 0
# (rolling_std_tweets is 0.0 for the first period in the displayed output).
processed_train, processed_test = (
    split_features.fillna(0)
    for split_features in create_time_series_features(train_df, test_df)
)

# Persist each table as CSV, omitting the positional index.
processed_train.to_csv(
    'final_features/time_series/train_time_features.csv',
    index=False,
)
processed_test.to_csv(
    'final_features/time_series/test_time_features.csv',
    index=False,
)

In [16]:
# Display the test-split feature table (pandas truncates long frames to head/tail rows).
processed_test

Unnamed: 0,ID,MatchID,PeriodID,tweet_count,rolling_mean_tweets,rolling_std_tweets,relative_frequency,tweet_momentum,period_percentile
0,6_0,6,0,361,361.000000,0.000000,0.003867,0.000000,0.323077
1,6_1,6,1,382,371.500000,14.849242,0.004092,0.058172,0.357692
2,6_2,6,2,384,375.666667,12.741010,0.004113,0.005236,0.369231
3,6_3,6,3,480,415.333333,56.011903,0.005141,0.250000,0.484615
4,6_4,6,4,653,505.666667,136.324368,0.006994,0.360417,0.615385
...,...,...,...,...,...,...,...,...,...
511,16_125,16,125,413,490.333333,78.008547,0.012307,-0.155419,0.884615
512,16_126,16,126,385,429.000000,53.814496,0.011473,-0.067797,0.869231
513,16_127,16,127,373,390.333333,20.526406,0.011115,-0.031169,0.853846
514,16_128,16,128,333,363.666667,27.227437,0.009923,-0.107239,0.819231
