# In [None]:
import os
import re
from collections import Counter
from functools import lru_cache
from itertools import combinations
from multiprocessing import Pool

import gensim.downloader as api
import igraph as ig
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import torch
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Report the PyTorch / CUDA environment and pick the device used for tensors.
_cuda_ok = torch.cuda.is_available()

print("PyTorch version:", torch.__version__)
print("Is CUDA available:", _cuda_ok)
if _cuda_ok:
    print("CUDA version:", torch.version.cuda)
else:
    print("CUDA version:", "Not available")
print("Number of GPUs:", torch.cuda.device_count())
if _cuda_ok:
    print("GPU name:", torch.cuda.get_device_name(0))
else:
    print("GPU name:", "No GPU detected")

# Fall back to the CPU whenever no CUDA device is visible.
device = torch.device("cuda") if _cuda_ok else torch.device("cpu")
print(f"Using device: {device}")
if not _cuda_ok:
    print("CUDA is not available. Check your PyTorch installation and GPU setup.")


# Fetch the NLTK corpora that preprocess_text relies on: the stop-word list
# and the WordNet data behind WordNetLemmatizer.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

# Load the 200-dimensional GloVe Twitter embeddings through Gensim's downloader.
# NOTE(review): embeddings_model is not referenced in the visible part of this
# file - confirm it is used elsewhere before paying for the load.
embeddings_model = api.load("glove-twitter-200")

# Everything stripped from a tweet: URLs, punctuation/symbols and digits.
# A literal '+' needs no entry of its own: the punctuation alternative
# ([^\w\s]) already removes it.
_NOISE_PATTERN = re.compile(r'http\S+|www\S+|[^\w\s]|\d')


@lru_cache(maxsize=1)
def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, built once and then reused."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalise a raw tweet into a space-separated string of lemmatised tokens.

    The text is lower-cased, stripped of URLs, punctuation and digits, split on
    whitespace, filtered against the NLTK English stop-word list and finally
    lemmatised with WordNet.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV cell) is treated as an empty
            tweet instead of raising ``AttributeError``.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''

    words = _NOISE_PATTERN.sub('', text.lower()).split()  # Tokenisation
    if not words:
        return ''

    # Building the stop-word set and the lemmatizer for every tweet dominated
    # the run time; they are created on first use and cached afterwards.
    stop_words, lemmatizer = _text_resources()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )


def tweet_to_igraph(tweet):
    """Build the undirected word co-occurrence graph of a single tweet.

    Every distinct word becomes a vertex (stored in the ``name`` attribute) and
    every pair of distinct words is linked by one edge. The ``weight`` of an
    edge counts the ordered pairs of token positions holding the two words, so
    words occurring ``a`` and ``b`` times are linked with weight ``2 * a * b``.
    A repeated word never produces a self-loop.

    Args:
        tweet: Whitespace-separated tokens of one (pre-processed) tweet.

    Returns:
        An ``igraph.Graph``; it has no vertices when the tweet has no tokens.
    """
    graph = ig.Graph(directed=False)

    # Counter keeps first-seen order, so vertex ids are reproducible across
    # runs (iterating a set of strings is not).
    occurrences = Counter(tweet.split())
    if not occurrences:  # Empty tweet
        return graph

    # Add the nodes
    graph.add_vertices(list(occurrences))

    # Add the weighted edges. Working on distinct words (instead of comparing
    # every pair of token positions) avoids the self-loops a repeated word
    # would create and is quadratic in the vocabulary only.
    edge_weights = {
        tuple(sorted((first, second))): 2 * occurrences[first] * occurrences[second]
        for first, second in combinations(occurrences, 2)
    }
    graph.add_edges(list(edge_weights))
    graph.es['weight'] = list(edge_weights.values())

    return graph


def combined_igraph_each_period(tweets):
    """Merge the co-occurrence graphs of all tweets of a period into one graph.

    Vertices are matched by word (the ``name`` attribute), never by the vertex
    index inside an individual tweet graph, and the weight of an edge is the
    sum of its weights over all tweets.

    Args:
        tweets: Iterable of pre-processed tweet strings.

    Returns:
        An undirected ``igraph.Graph`` with a ``name`` vertex attribute and a
        ``weight`` edge attribute; it has no vertices when no tweet contains
        a word.
    """
    vertex_names = {}  # dict used as an insertion-ordered set of words
    edge_weights = {}  # (word_a, word_b) with word_a <= word_b -> summed weight

    for tweet in tweets:
        tweet_graph = tweet_to_igraph(tweet)
        if tweet_graph.vcount() == 0:
            # An empty graph carries no 'name' attribute; reading it raises.
            continue

        names = tweet_graph.vs['name']
        for name in names:
            vertex_names.setdefault(name, None)

        if tweet_graph.ecount() == 0:
            continue

        # get_edgelist() yields vertex indices local to tweet_graph, so they
        # are translated back to words before being merged.
        for (source, target), weight in zip(tweet_graph.get_edgelist(),
                                            tweet_graph.es['weight']):
            key = tuple(sorted((names[source], names[target])))
            edge_weights[key] = edge_weights.get(key, 0) + weight

    # Build the graph in one go: adding vertices and edges one at a time makes
    # igraph redo its internal bookkeeping on every call.
    combined_graph = ig.Graph(directed=False)
    if vertex_names:
        combined_graph.add_vertices(list(vertex_names))
    if edge_weights:
        combined_graph.add_edges(list(edge_weights))
        combined_graph.es['weight'] = list(edge_weights.values())

    return combined_graph




    



def process_period_with_label(period, group):
    """Build the combined co-occurrence graph for one time period.

    Args:
        period: Key identifying the period; returned unchanged.
        group: DataFrame slice of that period; its ``'Tweet'`` column is read.

    Returns:
        A ``(period, graph)`` tuple. The graph is empty when the period has no
        tweets or when building it fails (the error is printed, not raised).
    """
    period_tweets = group['Tweet'].tolist()
    if len(period_tweets) == 0:
        return period, ig.Graph(directed=False)

    try:
        return period, combined_igraph_each_period(period_tweets)
    except Exception as e:
        # Best effort: one failing period must not abort the whole batch.
        print(f"Erreur lors du traitement de la période {period}: {e}")
        return period, ig.Graph(directed=False)


def extract_igraph_features(ig_graph):
    """Summarise a graph as a flat dictionary of structural features.

    Args:
        ig_graph: Undirected ``igraph.Graph``.

    Returns:
        A dict with the keys ``average_degree``, ``degree_std``, ``density``,
        ``diameter``, ``average_clustering`` and ``num_connected_components``.
        Every value is 0 when the graph has no vertices or no edges.
        ``diameter`` is reported as 0 for a disconnected graph, and
        ``average_clustering`` is 0 when no vertex has a defined local
        clustering coefficient.
    """
    if ig_graph.vcount() == 0 or ig_graph.ecount() == 0:  # if empty graph
        return {
            "average_degree": 0,
            "degree_std": 0,
            "density": 0,
            "diameter": 0,
            "average_clustering": 0,
            "num_connected_components": 0,
        }

    degrees = ig_graph.degree()

    # igraph reports NaN for vertices whose local clustering is undefined.
    # Average the defined values only, and fall back to 0 when there are none:
    # np.mean([]) would return NaN and the row would later be lost to dropna().
    clustering = ig_graph.transitivity_local_undirected(vertices=None)
    defined_clustering = [c for c in clustering if not np.isnan(c)]
    average_clustering = np.mean(defined_clustering) if defined_clustering else 0

    features = {
        "average_degree": np.mean(degrees),
        "degree_std": np.std(degrees),
        "density": ig_graph.density(),
        "diameter": ig_graph.diameter() if ig_graph.is_connected() else 0,
        "average_clustering": average_clustering,
        "num_connected_components": len(ig_graph.clusters()),
    }
    return features

# Read all training files and concatenate them into one dataframe.
# os.listdir returns entries in arbitrary order, so they are sorted to keep
# the row order of the concatenated frame reproducible.
li = []
for filename in sorted(os.listdir("train_tweets")):
    train_path = os.path.join("train_tweets", filename)
    # Hidden entries and sub-directories (e.g. notebook checkpoint folders)
    # are not tweet files and would make read_csv fail.
    if filename.startswith(".") or not os.path.isfile(train_path):
        continue
    df = pd.read_csv(train_path)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'] / 1000, unit='s')  # Divide by 1000 to convert to seconds

    li.append(df)

df = pd.concat(li, ignore_index=True)

# Apply preprocessing to each tweet
print("Début du prétraitement des tweets...")
df['Tweet'] = df['Tweet'].apply(preprocess_text)
print("Prétraitement terminé.")



# Bucket the tweets into one-minute periods and build one combined
# co-occurrence graph per period, in parallel on all available cores.
periods = df.groupby(pd.Grouper(key='Timestamp', freq='min'))
_period_jobs = (
    delayed(process_period_with_label)(period, group) for period, group in periods
)
igraph_dict = dict(Parallel(n_jobs=-1)(_period_jobs))

# One row of structural features per period, labelled with the first
# EventType seen in that period; periods without a label are dropped.
graph_features = {
    key: extract_igraph_features(graph) for key, graph in igraph_dict.items()
}
features_df = pd.DataFrame.from_dict(graph_features, orient="index")
features_df['EventType'] = periods['EventType'].first()
features_df = features_df.dropna()

y = features_df['EventType'].values
X = features_df.drop(columns=['EventType']).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Predictions on the held-out 30 %
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy method1: ", accuracy_score(y_test, y_pred))


# Refit on every labelled period for the final predictions; the
# most-frequent-class dummy classifier is fitted alongside as a baseline.
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X, y)
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

predictions = []
dummy_predictions = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the exported CSV files reproducible.
for fname in sorted(os.listdir("eval_tweets")):
    eval_path = os.path.join("eval_tweets", fname)
    # Hidden entries and sub-directories (e.g. notebook checkpoint folders)
    # are not tweet files and would make read_csv fail.
    if fname.startswith(".") or not os.path.isfile(eval_path):
        continue

    print(f"Processing file: {fname}")
    val_df = pd.read_csv(eval_path)
    val_df['Timestamp'] = pd.to_datetime(val_df['Timestamp'] / 1000, unit='s')  # Timestamp conversion
    val_df['Tweet'] = val_df['Tweet'].apply(preprocess_text)

    periods = val_df.groupby(pd.Grouper(key='Timestamp', freq='min'))

    igraph_dict_eval = dict(
        Parallel(n_jobs=-1)(
            delayed(process_period_with_label)(period, group) for period, group in periods
        )
    )

    graph_features_eval = {
        period: extract_igraph_features(ig_graph)
        for period, ig_graph in igraph_dict_eval.items()
    }

    features_eval_df = pd.DataFrame.from_dict(graph_features_eval, orient="index")
    features_eval_df['ID'] = periods['ID'].first()

    # Drop the rows containing NaN (periods without any tweet have no ID)
    features_eval_df = features_eval_df.dropna()
    if features_eval_df.empty:
        # predict() rejects an empty feature matrix; nothing to score here.
        print(f"No usable period in {fname}; skipping.")
        continue

    X_eval = features_eval_df.drop(columns=['ID']).values

    preds = clf.predict(X_eval)
    dummy_preds = dummy_clf.predict(X_eval)

    features_eval_df['EventType'] = preds
    features_eval_df['DummyEventType'] = dummy_preds

    predictions.append(features_eval_df[['ID', 'EventType']])
    dummy_predictions.append(features_eval_df[['ID', 'DummyEventType']])

pred_df = pd.concat(predictions)
pred_df.to_csv('graph_predictions.csv', index=False)

dummy_pred_df = pd.concat(dummy_predictions)
dummy_pred_df.to_csv('dummy_predictions.csv', index=False)