In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import dask.dataframe as dd

# Load data using Dask, specifying data types
train = dd.read_csv('train.csv')
test = dd.read_csv('test.csv')
users = dd.read_csv('users.csv', dtype={'birthyear': 'object', 'timezone': 'float64'})
user_friends = dd.read_csv('user_friends.csv.gz')
events = dd.read_csv('events.csv.gz', dtype={'city': 'object', 'country': 'object', 'state': 'object', 'zip': 'object'})
event_attendees = dd.read_csv('event_attendees.csv.gz')

# Preview the data
print("Train.csv:")
print(train.head(), "\n")

print("Test.csv:")
print(test.head(), "\n")

print("Users.csv:")
print(users.head(), "\n")

print("User_friends.csv:")
print(user_friends.head(), "\n")

print("Events.csv:")
print(events.head(), "\n")

print("Event_attendees.csv:")
print(event_attendees.head(), "\n")

# Data preprocessing
train['timestamp'] = dd.to_datetime(train['timestamp'], errors='coerce')
test['timestamp'] = dd.to_datetime(test['timestamp'], errors='coerce')
users['joinedAt'] = dd.to_datetime(users['joinedAt'], errors='coerce')
events['start_time'] = dd.to_datetime(events['start_time'], errors='coerce')

# Convert Dask data to Pandas DataFrame for EDA and further analysis
train_pd = train.compute()
test_pd = test.compute()
users_pd = users.compute()
user_friends_pd = user_friends.compute()
events_pd = events.compute()
event_attendees_pd = event_attendees.compute()

# EDA Analysis
print("Train Data Info")
print(train_pd.info())

print("\nTest Data Info")
print(test_pd.info())

print("\nUsers Data Info")
print(users_pd.info())

print("\nUser Friends Data Info")
print(user_friends_pd.info())

print("\nEvents Data Info")
print(events_pd.info())

print("\nEvent Attendees Data Info")
print(event_attendees_pd.info())

# Basic statistics
print("\nTrain Data Statistics")
print(train_pd.describe())

print("\nTest Data Statistics")
print(test_pd.describe())

print("\nUsers Data Statistics")
print(users_pd.describe())

print("\nEvents Data Statistics")
print(events_pd.describe())

# Visualizing missing values
def plot_missing_values(df, title):
    missing = df.isnull().mean()
    missing = missing[missing > 0]
    missing.sort_values(inplace=True)
    missing.plot.bar()
    plt.title(title)
    plt.show()

plot_missing_values(train_pd, "Missing Values in Train Data")
plot_missing_values(test_pd, "Missing Values in Test Data")
plot_missing_values(users_pd, "Missing Values in Users Data")
plot_missing_values(events_pd, "Missing Values in Events Data")

# Distribution of interested and not_interested in train data
train_pd['interested'].value_counts().plot(kind='bar', title='Distribution of Interested and Not Interested')
plt.show()

# Creating user and event mappings
user_ids = users_pd['user_id'].unique().tolist()
event_ids = train_pd['event'].unique().tolist()

user_to_index = {x: i for i, x in enumerate(user_ids)}
event_to_index = {x: i for i, x in enumerate(event_ids)}

train_pd['user'] = train_pd['user'].map(user_to_index)
train_pd['event'] = train_pd['event'].map(event_to_index)
test_pd['user'] = test_pd['user'].map(user_to_index)
test_pd['event'] = test_pd['event'].map(event_to_index)

# Checking for valid users and events in the test set
test_users = test_pd['user'].unique()
test_events = test_pd['event'].unique()

valid_users = np.isin(test_users, list(user_to_index.values()))
valid_events = np.isin(test_events, list(event_to_index.values()))

print(f"Number of users in the test set: {len(test_users)}")
print(f"Number of events in the test set: {len(test_events)}")
print(f"Number of users in the test set also in the training set: {np.sum(valid_users)}")
print(f"Number of events in the test set also in the training set: {np.sum(valid_events)}")

# Filter the test data
test_pd = test_pd[test_pd['user'].isin(user_to_index.values()) & test_pd['event'].isin(event_to_index.values())]

# Check that the test set is not empty after filtering
if test_pd.empty:
    print("The test set is empty after filtering. Ensure data is correct.")
else:
    num_users = len(user_ids)
    num_events = len(event_ids)

    # Prepare data for the model
    X = train_pd[['user', 'event']].values
    y = train_pd['interested'].values

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create matrix factorization model
    class RecommenderNet(tf.keras.Model):
        def __init__(self, num_users, num_events, embedding_size=50, **kwargs):
            super(RecommenderNet, self).__init__(**kwargs)
            self.user_embedding = tf.keras.layers.Embedding(
                num_users, embedding_size,
                embeddings_initializer='he_normal',
                embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
            )
            self.event_embedding = tf.keras.layers.Embedding(
                num_events, embedding_size,
                embeddings_initializer='he_normal',
                embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
            )
            self.user_bias = tf.keras.layers.Embedding(num_users, 1)
            self.event_bias = tf.keras.layers.Embedding(num_events, 1)

        def call(self, inputs):
            user_vector = self.user_embedding(inputs[:, 0])
            event_vector = self.event_embedding(inputs[:, 1])
            
            user_bias = self.user_bias(inputs[:, 0])
            event_bias = self.event_bias(inputs[:, 1])
            
            dot_user_event = tf.tensordot(user_vector, event_vector, 2)
            
            x = dot_user_event + user_bias + event_bias
            
            return tf.nn.sigmoid(x)

    # Model parameters
    embedding_size = 50

    model = RecommenderNet(num_users, num_events, embedding_size)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[tf.keras.metrics.AUC()]
    )

    # Train the model
    history = model.fit(
        x=X_train,
        y=y_train,
        batch_size=64,
        epochs=10,
        validation_data=(X_val, y_val)
    )

    # Function for making recommendations
    def recommend(user_id, model, num_recommendations=10):
        if user_id not in user_to_index:
            print(f"User ID {user_id} not found.")
            return []
            
        user_index = user_to_index[user_id]
        event_indices = np.arange(num_events)
        
        user_array = np.array([user_index] * num_events)
        event_array = event_indices
        
        predictions = model.predict(np.vstack([user_array, event_array]).T).flatten()
        
        top_indices = predictions.argsort()[-num_recommendations:][::-1]
        recommended_events = [event_ids[i] for i in top_indices]
        
        return recommended_events

    # Create the submission file
    submission = []

    for user_id in test_pd['user'].unique():
        user_id = int(user_id)  # Convert to integer
        original_user_id = user_ids[user_id]
        recommendations = recommend(original_user_id, model, num_recommendations=200)
        submission.append({
            'User': original_user_id,
            'Events': ' '.join(map(str, recommendations))
        })

    submission_df = pd.DataFrame(submission)
    submission_df.to_csv('submission_TensorFlow.csv', index=False)
    print("Submission file created.")


ImportError: Dask dataframe requirements are not installed.

Please either conda or pip install as follows:

  conda install dask                     # either conda install
  python -m pip install "dask[dataframe]" --upgrade  # or python -m pip install

In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load data using Pandas, specifying data types
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
users = pd.read_csv('users.csv', dtype={'birthyear': 'object', 'timezone': 'float64'})
user_friends = pd.read_csv('user_friends.csv.gz')
events = pd.read_csv('events.csv.gz', dtype={'city': 'object', 'country': 'object', 'state': 'object', 'zip': 'object'})
event_attendees = pd.read_csv('event_attendees.csv.gz')

# Preview the data
print("Train.csv:")
print(train.head(), "\n")

print("Test.csv:")
print(test.head(), "\n")

print("Users.csv:")
print(users.head(), "\n")

print("User_friends.csv:")
print(user_friends.head(), "\n")

print("Events.csv:")
print(events.head(), "\n")

print("Event_attendees.csv:")
print(event_attendees.head(), "\n")

# Data preprocessing
train['timestamp'] = pd.to_datetime(train['timestamp'], errors='coerce')
test['timestamp'] = pd.to_datetime(test['timestamp'], errors='coerce')
users['joinedAt'] = pd.to_datetime(users['joinedAt'], errors='coerce')
events['start_time'] = pd.to_datetime(events['start_time'], errors='coerce')

# EDA Analysis
print("Train Data Info")
print(train.info())

print("\nTest Data Info")
print(test.info())

print("\nUsers Data Info")
print(users.info())

print("\nUser Friends Data Info")
print(user_friends.info())

print("\nEvents Data Info")
print(events.info())

print("\nEvent Attendees Data Info")
print(event_attendees.info())

# Basic statistics
print("\nTrain Data Statistics")
print(train.describe())

print("\nTest Data Statistics")
print(test.describe())

print("\nUsers Data Statistics")
print(users.describe())

print("\nEvents Data Statistics")
print(events.describe())

# Visualizing missing values
def plot_missing_values(df, title):
    missing = df.isnull().mean()
    missing = missing[missing > 0]
    missing.sort_values(inplace=True)
    missing.plot.bar()
    plt.title(title)
    plt.show()

plot_missing_values(train, "Missing Values in Train Data")
plot_missing_values(test, "Missing Values in Test Data")
plot_missing_values(users, "Missing Values in Users Data")
plot_missing_values(events, "Missing Values in Events Data")

# Distribution of interested and not_interested in train data
train['interested'].value_counts().plot(kind='bar', title='Distribution of Interested and Not Interested')
plt.show()

# Creating user and event mappings
user_ids = users['user_id'].unique().tolist()
event_ids = train['event'].unique().tolist()

user_to_index = {x: i for i, x in enumerate(user_ids)}
event_to_index = {x: i for i, x in enumerate(event_ids)}

train['user'] = train['user'].map(user_to_index)
train['event'] = train['event'].map(event_to_index)
test['user'] = test['user'].map(user_to_index)
test['event'] = test['event'].map(event_to_index)

# Checking for valid users and events in the test set
test_users = test['user'].unique()
test_events = test['event'].unique()

valid_users = np.isin(test_users, list(user_to_index.values()))
valid_events = np.isin(test_events, list(event_to_index.values()))

print(f"Number of users in the test set: {len(test_users)}")
print(f"Number of events in the test set: {len(test_events)}")
print(f"Number of users in the test set also in the training set: {np.sum(valid_users)}")
print(f"Number of events in the test set also in the training set: {np.sum(valid_events)}")

# Filter the test data
test = test[test['user'].isin(user_to_index.values()) & test['event'].isin(event_to_index.values())]

# Check that the test set is not empty after filtering
if test.empty:
    print("The test set is empty after filtering. Ensure data is correct.")
else:
    num_users = len(user_ids)
    num_events = len(event_ids)

    # Prepare data for the model
    X = train[['user', 'event']].values
    y = train['interested'].values

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create matrix factorization model
    class RecommenderNet(tf.keras.Model):
        def __init__(self, num_users, num_events, embedding_size=50, **kwargs):
            super(RecommenderNet, self).__init__(**kwargs)
            self.user_embedding = tf.keras.layers.Embedding(
                num_users, embedding_size,
                embeddings_initializer='he_normal',
                embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
            )
            self.event_embedding = tf.keras.layers.Embedding(
                num_events, embedding_size,
                embeddings_initializer='he_normal',
                embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
            )
            self.user_bias = tf.keras.layers.Embedding(num_users, 1)
            self.event_bias = tf.keras.layers.Embedding(num_events, 1)

        def call(self, inputs):
            user_vector = self.user_embedding(inputs[:, 0])
            event_vector = self.event_embedding(inputs[:, 1])
            
            user_bias = self.user_bias(inputs[:, 0])
            event_bias = self.event_bias(inputs[:, 1])
            
            dot_user_event = tf.tensordot(user_vector, event_vector, 2)
            
            x = dot_user_event + user_bias + event_bias
            
            return tf.nn.sigmoid(x)

    # Model parameters
    embedding_size = 50

    model = RecommenderNet(num_users, num_events, embedding_size)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[tf.keras.metrics.AUC()]
    )

    # Train the model
    history = model.fit(
        x=X_train,
        y=y_train,
        batch_size=64,
        epochs=10,
        validation_data=(X_val, y_val)
    )

    # Function for making recommendations
    def recommend(user_id, model, num_recommendations=10):
        if user_id not in user_to_index:
            print(f"User ID {user_id} not found.")
            return []
            
        user_index = user_to_index[user_id]
        event_indices = np.arange(num_events)
        
        user_array = np.array([user_index] * num_events)
        event_array = event_indices
        
        predictions = model.predict(np.vstack([user_array, event_array]).T).flatten()
        
        top_indices = predictions.argsort()[-num_recommendations:][::-1]
        recommended_events = [event_ids[i] for i in top_indices]
        
        return recommended_events

    # Create the submission file
    submission = []

    for user_id in test['user'].unique():
        user_id = int(user_id)  # Convert to integer
        original_user_id = user_ids[user_id]
        recommendations = recommend(original_user_id, model, num_recommendations=200)
        submission.append({
            'User': original_user_id,
            'Events': ' '.join(map(str, recommendations))
        })

    submission_df = pd.DataFrame(submission)
    submission_df.to_csv('submission_TensorFlow.csv', index=False)
    print("Submission file created.")


MemoryError: Unable to allocate 2.41 GiB for an array with shape (103, 3137972) and data type int64

In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Define the chunksize
chunksize = 1000  # Adjust this value based on your system's memory

# Load data in chunks and concatenate into a DataFrame
train = pd.read_csv('train.csv', chunksize=chunksize)
train = pd.concat(train)

test = pd.read_csv('test.csv', chunksize=chunksize)
test = pd.concat(test)

users = pd.read_csv('users.csv', dtype={'birthyear': 'object', 'timezone': 'float64'}, chunksize=chunksize)
users = pd.concat(users)

user_friends = pd.read_csv('user_friends.csv.gz', chunksize=chunksize)
user_friends = pd.concat(user_friends)

# Reading events.csv in chunks to avoid MemoryError
events = pd.read_csv('events.csv.gz', dtype={'city': 'object', 'country': 'object', 'state': 'object', 'zip': 'object'}, chunksize=chunksize)
events = pd.concat(events)

event_attendees = pd.read_csv('event_attendees.csv.gz', chunksize=chunksize)
event_attendees = pd.concat(event_attendees)

# Preview the data (first 5 rows from each dataset)
print("Train.csv:")
print(train.head(), "\n")

print("Test.csv:")
print(test.head(), "\n")

print("Users.csv:")
print(users.head(), "\n")

print("User_friends.csv:")
print(user_friends.head(), "\n")

print("Events.csv:")
print(events.head(), "\n")

print("Event_attendees.csv:")
print(event_attendees.head(), "\n")

# Data preprocessing
train['timestamp'] = pd.to_datetime(train['timestamp'], errors='coerce')
test['timestamp'] = pd.to_datetime(test['timestamp'], errors='coerce')
users['joinedAt'] = pd.to_datetime(users['joinedAt'], errors='coerce')
events['start_time'] = pd.to_datetime(events['start_time'], errors='coerce')

# EDA Analysis
print("Train Data Info")
print(train.info())

print("\nTest Data Info")
print(test.info())

print("\nUsers Data Info")
print(users.info())

print("\nUser Friends Data Info")
print(user_friends.info())

print("\nEvents Data Info")
print(events.info())

print("\nEvent Attendees Data Info")
print(event_attendees.info())

# Basic statistics
print("\nTrain Data Statistics")
print(train.describe())

print("\nTest Data Statistics")
print(test.describe())

print("\nUsers Data Statistics")
print(users.describe())

print("\nEvents Data Statistics")
print(events.describe())

# Visualizing missing values
def plot_missing_values(df, title):
    missing = df.isnull().mean()
    missing = missing[missing > 0]
    missing.sort_values(inplace=True)
    missing.plot.bar()
    plt.title(title)
    plt.show()

plot_missing_values(train, "Missing Values in Train Data")
plot_missing_values(test, "Missing Values in Test Data")
plot_missing_values(users, "Missing Values in Users Data")
plot_missing_values(events, "Missing Values in Events Data")

# Distribution of interested and not_interested in train data
train['interested'].value_counts().plot(kind='bar', title='Distribution of Interested and Not Interested')
plt.show()

# Creating user and event mappings
user_ids = users['user_id'].unique().tolist()
event_ids = train['event'].unique().tolist()

user_to_index = {x: i for i, x in enumerate(user_ids)}
event_to_index = {x: i for i, x in enumerate(event_ids)}

train['user'] = train['user'].map(user_to_index)
train['event'] = train['event'].map(event_to_index)
test['user'] = test['user'].map(user_to_index)
test['event'] = test['event'].map(event_to_index)

# Checking for valid users and events in the test set
test_users = test['user'].unique()
test_events = test['event'].unique()

valid_users = np.isin(test_users, list(user_to_index.values()))
valid_events = np.isin(test_events, list(event_to_index.values()))

print(f"Number of users in the test set: {len(test_users)}")
print(f"Number of events in the test set: {len(test_events)}")
print(f"Number of users in the test set also in the training set: {np.sum(valid_users)}")
print(f"Number of events in the test set also in the training set: {np.sum(valid_events)}")

# Filter the test data
test = test[test['user'].isin(user_to_index.values()) & test['event'].isin(event_to_index.values())]

# Check that the test set is not empty after filtering
if test.empty:
    print("The test set is empty after filtering. Ensure data is correct.")
else:
    num_users = len(user_ids)
    num_events = len(event_ids)

    # Prepare data for the model
    X = train[['user', 'event']].values
    y = train['interested'].values

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create matrix factorization model
    class RecommenderNet(tf.keras.Model):
        def __init__(self, num_users, num_events, embedding_size=50, **kwargs):
            super(RecommenderNet, self).__init__(**kwargs)
            self.user_embedding = tf.keras.layers.Embedding(
                num_users, embedding_size,
                embeddings_initializer='he_normal',
                embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
            )
            self.event_embedding = tf.keras.layers.Embedding(
                num_events, embedding_size,
                embeddings_initializer='he_normal',
                embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
            )
            self.user_bias = tf.keras.layers.Embedding(num_users, 1)
            self.event_bias = tf.keras.layers.Embedding(num_events, 1)

        def call(self, inputs):
            user_vector = self.user_embedding(inputs[:, 0])
            event_vector = self.event_embedding(inputs[:, 1])
            
            user_bias = self.user_bias(inputs[:, 0])
            event_bias = self.event_bias(inputs[:, 1])
            
            dot_user_event = tf.tensordot(user_vector, event_vector, 2)
            
            x = dot_user_event + user_bias + event_bias
            
            return tf.nn.sigmoid(x)

    # Model parameters
    embedding_size = 50

    model = RecommenderNet(num_users, num_events, embedding_size)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[tf.keras.metrics.AUC()]
    )

    # Train the model
    history = model.fit(
        x=X_train,
        y=y_train,
        batch_size=64,
        epochs=10,
        validation_data=(X_val, y_val)
    )

    # Function for making recommendations
    def recommend(user_id, model, num_recommendations=10):
        if user_id not in user_to_index:
            print(f"User ID {user_id} not found.")
            return []
            
        user_index = user_to_index[user_id]
        event_indices = np.arange(num_events)
        
        user_array = np.array([user_index] * num_events)
        event_array = event_indices
        
        predictions = model.predict(np.vstack([user_array, event_array]).T).flatten()
        
        top_indices = predictions.argsort()[-num_recommendations:][::-1]
        recommended_events = [event_ids[i] for i in top_indices]
        
        return recommended_events

    # Create the submission file
    submission = []

    for user_id in test['user'].unique():
        user_id = int(user_id)  # Convert to integer
        original_user_id = user_ids[user_id]
        recommendations = recommend(original_user_id, model, num_recommendations=200)
        submission.append({
            'User': original_user_id,
            'Events': ' '.join(map(str, recommendations))
        })

    submission_df = pd.DataFrame(submission)
    submission_df.to_csv('submission_TensorFlow.csv', index=False)
    print("Submission file created.")


Train.csv:
      user       event  invited                         timestamp  interested  \
0  3044012  1918771225        0  2012-10-02 15:53:05.754000+00:00           0   
1  3044012  1502284248        0  2012-10-02 15:53:05.754000+00:00           0   
2  3044012  2529072432        0  2012-10-02 15:53:05.754000+00:00           1   
3  3044012  3072478280        0  2012-10-02 15:53:05.754000+00:00           0   
4  3044012  1390707377        0  2012-10-02 15:53:05.754000+00:00           0   

   not_interested  
0               0  
1               0  
2               0  
3               0  
4               0   

Test.csv:
      user       event  invited                         timestamp
0  1776192  2877501688        0  2012-11-30 11:39:01.230000+00:00
1  1776192  3025444328        0  2012-11-30 11:39:01.230000+00:00
2  1776192  4078218285        0  2012-11-30 11:39:01.230000+00:00
3  1776192  1024025121        0  2012-11-30 11:39:01.230000+00:00
4  1776192  2972428928        0  2012-11

MemoryError: Unable to allocate 2.41 GiB for an array with shape (103, 3137972) and data type int64

In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Define the chunksize to control memory usage
chunksize = 100000  # Adjust this based on your available memory

# Initialize lists to store processed chunks if you want to accumulate the results
train_data = []
test_data = []
users_data = []
user_friends_data = []
events_data = []
event_attendees_data = []

# Load and process train.csv in chunks
for chunk in pd.read_csv('train.csv', chunksize=chunksize):
    chunk['timestamp'] = pd.to_datetime(chunk['timestamp'], errors='coerce')
    train_data.append(chunk)

# Load and process test.csv in chunks
for chunk in pd.read_csv('test.csv', chunksize=chunksize):
    chunk['timestamp'] = pd.to_datetime(chunk['timestamp'], errors='coerce')
    test_data.append(chunk)

# Load and process users.csv in chunks
for chunk in pd.read_csv('users.csv', dtype={'birthyear': 'object', 'timezone': 'float64'}, chunksize=chunksize):
    chunk['joinedAt'] = pd.to_datetime(chunk['joinedAt'], errors='coerce')
    users_data.append(chunk)

# Load and process user_friends.csv.gz in chunks
for chunk in pd.read_csv('user_friends.csv.gz', chunksize=chunksize):
    user_friends_data.append(chunk)

# Load and process events.csv.gz in chunks
for chunk in pd.read_csv('events.csv.gz', dtype={'city': 'object', 'country': 'object', 'state': 'object', 'zip': 'object'}, chunksize=chunksize):
    chunk['start_time'] = pd.to_datetime(chunk['start_time'], errors='coerce')
    events_data.append(chunk)

# Load and process event_attendees.csv.gz in chunks
for chunk in pd.read_csv('event_attendees.csv.gz', chunksize=chunksize):
    event_attendees_data.append(chunk)

# Combine the chunks into a single DataFrame (if needed)
train_df = pd.concat(train_data, ignore_index=True)
test_df = pd.concat(test_data, ignore_index=True)
users_df = pd.concat(users_data, ignore_index=True)
user_friends_df = pd.concat(user_friends_data, ignore_index=True)
events_df = pd.concat(events_data, ignore_index=True)
event_attendees_df = pd.concat(event_attendees_data, ignore_index=True)

# Preview the data
print("Train.csv:")
print(train_df.head(), "\n")

print("Test.csv:")
print(test_df.head(), "\n")

print("Users.csv:")
print(users_df.head(), "\n")

print("User_friends.csv:")
print(user_friends_df.head(), "\n")

print("Events.csv:")
print(events_df.head(), "\n")

print("Event_attendees.csv:")
print(event_attendees_df.head(), "\n")

# Basic statistics and analysis
print("\nTrain Data Statistics")
print(train_df.describe())

print("\nTest Data Statistics")
print(test_df.describe())

print("\nUsers Data Statistics")
print(users_df.describe())

print("\nEvents Data Statistics")
print(events_df.describe())

# Visualizing missing values
def plot_missing_values(df, title):
    missing = df.isnull().mean()
    missing = missing[missing > 0]
    missing.sort_values(inplace=True)
    missing.plot.bar()
    plt.title(title)
    plt.show()

plot_missing_values(train_df, "Missing Values in Train Data")
plot_missing_values(test_df, "Missing Values in Test Data")
plot_missing_values(users_df, "Missing Values in Users Data")
plot_missing_values(events_df, "Missing Values in Events Data")

# Distribution of interested and not_interested in train data
train_df['interested'].value_counts().plot(kind='bar', title='Distribution of Interested and Not Interested')
plt.show()

# Creating user and event mappings
user_ids = users_df['user_id'].unique().tolist()
event_ids = train_df['event'].unique().tolist()

user_to_index = {x: i for i, x in enumerate(user_ids)}
event_to_index = {x: i for i, x in enumerate(event_ids)}

train_df['user'] = train_df['user'].map(user_to_index)
train_df['event'] = train_df['event'].map(event_to_index)
test_df['user'] = test_df['user'].map(user_to_index)
test_df['event'] = test_df['event'].map(event_to_index)

# Checking for valid users and events in the test set
test_users = test_df['user'].unique()
test_events = test_df['event'].unique()

valid_users = np.isin(test_users, list(user_to_index.values()))
valid_events = np.isin(test_events, list(event_to_index.values()))

print(f"Number of users in the test set: {len(test_users)}")
print(f"Number of events in the test set: {len(test_events)}")
print(f"Number of users in the test set also in the training set: {np.sum(valid_users)}")
print(f"Number of events in the test set also in the training set: {np.sum(valid_events)}")

# Filter the test data
test_df = test_df[test_df['user'].isin(user_to_index.values()) & test_df['event'].isin(event_to_index.values())]

# Check that the test set is not empty after filtering
if test_df.empty:
    print("The test set is empty after filtering. Ensure data is correct.")
else:
    num_users = len(user_ids)
    num_events = len(event_ids)

    # Prepare data for the model
    X = train_df[['user', 'event']].values
    y = train_df['interested'].values

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create matrix factorization model
    class RecommenderNet(tf.keras.Model):
        def __init__(self, num_users, num_events, embedding_size=50, **kwargs):
            super(RecommenderNet, self).__init__(**kwargs)
            self.user_embedding = tf.keras.layers.Embedding(
                num_users, embedding_size,
                embeddings_initializer='he_normal',
                embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
            )
            self.event_embedding = tf.keras.layers.Embedding(
                num_events, embedding_size,
                embeddings_initializer='he_normal',
                embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
            )
            self.user_bias = tf.keras.layers.Embedding(num_users, 1)
            self.event_bias = tf.keras.layers.Embedding(num_events, 1)

        def call(self, inputs):
            user_vector = self.user_embedding(inputs[:, 0])
            event_vector = self.event_embedding(inputs[:, 1])
            
            user_bias = self.user_bias(inputs[:, 0])
            event_bias = self.event_bias(inputs[:, 1])
            
            dot_user_event = tf.tensordot(user_vector, event_vector, 2)
            
            x = dot_user_event + user_bias + event_bias
            
            return tf.nn.sigmoid(x)

    # Model parameters
    embedding_size = 50

    model = RecommenderNet(num_users, num_events, embedding_size)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[tf.keras.metrics.AUC()]
    )

    # Train the model
    history = model.fit(
        x=X_train,
        y=y_train,
        batch_size=64,
        epochs=10,
        validation_data=(X_val, y_val)
    )

    # Function for making recommendations
    def recommend(user_id, model, num_recommendations=10):
        if user_id not in user_to_index:
            print(f"User ID {user_id} not found.")
            return []
            
        user_index = user_to_index[user_id]
        event_indices = np.arange(num_events)
        
        user_array = np.array([user_index] * num_events)
        event_array = event_indices
        
        predictions = model.predict(np.vstack([user_array, event_array]).T).flatten()
        
        top_indices = predictions.argsort()[-num_recommendations:][::-1]
        recommended_events = [event_ids[i] for i in top_indices]
        
        return recommended_events

    # Create the submission file
    submission = []

    for user_id in test_df['user'].unique():
        user_id = int(user_id)  # Convert to integer
        original_user_id = user_ids[user_id]
        recommendations = recommend(original_user_id, model, num_recommendations=200)
        submission.append({
            'User': original_user_id,
            'Events': ' '.join(map(str, recommendations))
        })

    submission_df = pd.DataFrame(submission)
    submission_df.to_csv('submission_TensorFlow.csv', index=False)
    print("Submission file created.")


ParserError: Error tokenizing data. C error: out of memory

In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import csv
import gzip

# Define the chunksize to control memory usage
chunksize = 50000  # Reduced chunksize

# Initialize lists to store processed chunks if you want to accumulate the results
train_data = []
test_data = []
users_data = []
user_friends_data = []
events_data = []
event_attendees_data = []

# Load and process train.csv in chunks
for chunk in pd.read_csv('train.csv', chunksize=chunksize, iterator=True):
    chunk['timestamp'] = pd.to_datetime(chunk['timestamp'], errors='coerce')
    train_data.append(chunk)

# Load and process test.csv in chunks
for chunk in pd.read_csv('test.csv', chunksize=chunksize, iterator=True):
    chunk['timestamp'] = pd.to_datetime(chunk['timestamp'], errors='coerce')
    test_data.append(chunk)

# Load and process users.csv in chunks
for chunk in pd.read_csv('users.csv', dtype={'birthyear': 'object', 'timezone': 'float64'}, chunksize=chunksize, iterator=True):
    chunk['joinedAt'] = pd.to_datetime(chunk['joinedAt'], errors='coerce')
    users_data.append(chunk)

# Load and process user_friends.csv.gz using gzip
with gzip.open('user_friends.csv.gz', mode='rt', encoding='utf-8') as file:
    reader = csv.reader(file)
    header = next(reader)  # Skip header
    for row in reader:
        user_friends_data.append(row)

# Load and process events.csv.gz in chunks
for chunk in pd.read_csv('events.csv.gz', dtype={'city': 'object', 'country': 'object', 'state': 'object', 'zip': 'object'}, chunksize=chunksize, iterator=True):
    chunk['start_time'] = pd.to_datetime(chunk['start_time'], errors='coerce')
    events_data.append(chunk)

# Load and process event_attendees.csv.gz in chunks
for chunk in pd.read_csv('event_attendees.csv.gz', chunksize=chunksize, iterator=True):
    event_attendees_data.append(chunk)

# Combine the chunks into a single DataFrame (if needed)
train_df = pd.concat(train_data, ignore_index=True)
test_df = pd.concat(test_data, ignore_index=True)
users_df = pd.concat(users_data, ignore_index=True)
events_df = pd.concat(events_data, ignore_index=True)
event_attendees_df = pd.concat(event_attendees_data, ignore_index=True)

# Process the user_friends_data manually if necessary, as it's handled by csv reader
user_friends_df = pd.DataFrame(user_friends_data, columns=header)

# Preview the data
print("Train.csv:")
print(train_df.head(), "\n")

print("Test.csv:")
print(test_df.head(), "\n")

print("Users.csv:")
print(users_df.head(), "\n")

print("User_friends.csv:")
print(user_friends_df.head(), "\n")

print("Events.csv:")
print(events_df.head(), "\n")

print("Event_attendees.csv:")
print(event_attendees_df.head(), "\n")

# The rest of your code for data processing, model training, etc. goes here...


# The rest of your code for data processing, model training, etc. goes here...
# Visualizing missing values
def plot_missing_values(df, title):
    missing = df.isnull().mean()
    missing = missing[missing > 0]
    missing.sort_values(inplace=True)
    missing.plot.bar()
    plt.title(title)
    plt.show()

plot_missing_values(train_df, "Missing Values in Train Data")
plot_missing_values(test_df, "Missing Values in Test Data")
plot_missing_values(users_df, "Missing Values in Users Data")
plot_missing_values(events_df, "Missing Values in Events Data")

# Distribution of interested and not_interested in train data
train_df['interested'].value_counts().plot(kind='bar', title='Distribution of Interested and Not Interested')
plt.show()

# Creating user and event mappings
user_ids = users_df['user_id'].unique().tolist()
event_ids = train_df['event'].unique().tolist()

user_to_index = {x: i for i, x in enumerate(user_ids)}
event_to_index = {x: i for i, x in enumerate(event_ids)}

train_df['user'] = train_df['user'].map(user_to_index)
train_df['event'] = train_df['event'].map(event_to_index)
test_df['user'] = test_df['user'].map(user_to_index)
test_df['event'] = test_df['event'].map(event_to_index)

# Checking for valid users and events in the test set
test_users = test_df['user'].unique()
test_events = test_df['event'].unique()

valid_users = np.isin(test_users, list(user_to_index.values()))
valid_events = np.isin(test_events, list(event_to_index.values()))

print(f"Number of users in the test set: {len(test_users)}")
print(f"Number of events in the test set: {len(test_events)}")
print(f"Number of users in the test set also in the training set: {np.sum(valid_users)}")
print(f"Number of events in the test set also in the training set: {np.sum(valid_events)}")

# Filter the test data
test_df = test_df[test_df['user'].isin(user_to_index.values()) & test_df['event'].isin(event_to_index.values())]

# Check that the test set is not empty after filtering
if test_df.empty:
    print("The test set is empty after filtering. Ensure data is correct.")
else:
    num_users = len(user_ids)
    num_events = len(event_ids)

    # Prepare data for the model
    X = train_df[['user', 'event']].values
    y = train_df['interested'].values

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create matrix factorization model
    class RecommenderNet(tf.keras.Model):
        def __init__(self, num_users, num_events, embedding_size=50, **kwargs):
            super(RecommenderNet, self).__init__(**kwargs)
            self.user_embedding = tf.keras.layers.Embedding(
                num_users, embedding_size,
                embeddings_initializer='he_normal',
                embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
            )
            self.event_embedding = tf.keras.layers.Embedding(
                num_events, embedding_size,
                embeddings_initializer='he_normal',
                embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
            )
            self.user_bias = tf.keras.layers.Embedding(num_users, 1)
            self.event_bias = tf.keras.layers.Embedding(num_events, 1)

        def call(self, inputs):
            user_vector = self.user_embedding(inputs[:, 0])
            event_vector = self.event_embedding(inputs[:, 1])
            
            user_bias = self.user_bias(inputs[:, 0])
            event_bias = self.event_bias(inputs[:, 1])
            
            dot_user_event = tf.tensordot(user_vector, event_vector, 2)
            
            x = dot_user_event + user_bias + event_bias
            
            return tf.nn.sigmoid(x)

    # Model parameters
    embedding_size = 50

    model = RecommenderNet(num_users, num_events, embedding_size)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[tf.keras.metrics.AUC()]
    )

    # Train the model
    history = model.fit(
        x=X_train,
        y=y_train,
        batch_size=64,
        epochs=10,
        validation_data=(X_val, y_val)
    )

    # Function for making recommendations
    def recommend(user_id, model, num_recommendations=10):
        if user_id not in user_to_index:
            print(f"User ID {user_id} not found.")
            return []
            
        user_index = user_to_index[user_id]
        event_indices = np.arange(num_events)
        
        user_array = np.array([user_index] * num_events)
        event_array = event_indices
        
        predictions = model.predict(np.vstack([user_array, event_array]).T).flatten()
        
        top_indices = predictions.argsort()[-num_recommendations:][::-1]
        recommended_events = [event_ids[i] for i in top_indices]
        
        return recommended_events

    # Create the submission file
    submission = []

    for user_id in test_df['user'].unique():
        user_id = int(user_id)  # Convert to integer
        original_user_id = user_ids[user_id]
        recommendations = recommend(original_user_id, model, num_recommendations=200)
        submission.append({
            'User': original_user_id,
            'Events': ' '.join(map(str, recommendations))
        })

    submission_df = pd.DataFrame(submission)
    submission_df.to_csv('submission_TensorFlow.csv', index=False)
    print("Submission file created.")


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte