

In [111]:
import pandas as pd

In [328]:
dataset = pd.read_csv('most_popular_repositories_1000.csv')

dataset['Description'] = dataset['Description'].fillna('')
dataset['description_length'] = dataset['Description'].apply(len)

columns_to_drop = ['Name', 'FullName', 'HtmlUrl', 'Description', 'Language','OwnerLogin', 'OwnerType', 'CreatedAt', 'UpdatedAt', 'PushedAt', 'License','WatchersCount']

dataset = dataset.drop(columns=columns_to_drop)

ids = dataset['Id']
dataset = dataset.drop(columns = ['Id'])



print(dataset.head())

   OpenIssuesCount  ForksCount  StargazersCount    Size  description_length
0               43       27855           331707    1525                  53
1              377       33776           316814    5030                  30
2              338       28727           306876    1058                  73
3               55       76810           306451   22318                  69
4               41       39078           296377  243999                 100


In [27]:
import os
import pandas as pd
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import time
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error  # for regression tasks
import numpy as np

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(4, 32)  # Assuming 3 input features
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 32) 
        self.fc4 = nn.Linear(32, 1) 

    def forward(self, x):
        x = torch.relu(self.fc1(x))  # ReLU activation for hidden layers
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x


class CSVHandler(FileSystemEventHandler):
    def __init__(self, directory):
        self.directory = directory
        self.processed_count = 2
        self.train_file = "train.csv"
        self.val_file = "validation.csv"
        self.test_file = "test.csv"

    def on_created(self, event):
        if event.src_path.endswith('.csv'):
            print(f"New CSV detected: {event.src_path}")
            self.process_csv(event.src_path)

    def process_csv(self, file_path):
        try:
            # Read the CSV file
            data = pd.read_csv(file_path)
            print(f"Processing CSV file ...")  # Example: print first 5 rows

            #initialise the model
            model = SimpleNN()

            #preprocess the data
            print("pre-processing the data...")
            dataset = self.pre_process(data)


            if self.processed_count == 0:
                print(f"Processing to get unique 400 data...")  # Custom logic for the first 3 files
                self.add_to_train(dataset)


            if self.processed_count == 1:
                print("Creating validation dataset...")
                self.add_to_validation(dataset)


            if self.processed_count == 2:
                print("Getting training data and validation data and normalising...")
                train_data = pd.read_csv(self.train_file)
                validation_data = pd.read_csv(self.val_file)

                train_len = len(train_data)

                print("Combine the data for normalization...")
                combined_data = pd.concat([train_data, validation_data])

                print("Normalising...")
                combined_data = self.normalise(combined_data)

                n_train = combined_data.iloc[:train_len].reset_index(drop=True)
                n_validation = combined_data.iloc[train_len:].reset_index(drop=True)

                print("Training the model...")
                self.train_the_model(n_train,n_validation, model)
                self.processed_count = 3
                print("Model is trained!!!")
                print("Ready to predict the repo trend...")
            
            if self.processed_count == 3:
                print("Trend predictor running ...")
                self.add_to_test(dataset)
                popularity_score = self.predict_the_trend(model)
                self.sort_by_popularity(popularity_score)

                
            print("Waiting for next file...")

        except Exception as e:
            print(f"Failed to process : {e}")

    def sort_by_popularity(self, popularity_score):
        test_file = "test.csv"
        test_data = pd.read_csv(test_file)

        test_data['popularity_score'] = popularity_score
        sorted_data = test_data.sort_values(by='popularity_score', ascending=False)

        sorted_data.to_csv("sorted_test.csv", index=False)
        return

    def pre_process(self, dataset):
        dataset['Description'] = dataset['Description'].fillna('')
        dataset['description_length'] = dataset['Description'].apply(len)

        columns_to_drop = ['FullName', 'HtmlUrl', 'Description', 'Language','OwnerLogin', 'OwnerType', 'CreatedAt', 'UpdatedAt', 'PushedAt', 'License','WatchersCount']

        dataset = dataset.drop(columns=columns_to_drop)

        return dataset
    

    
    def add_to_train(self, dataset):
        try:
            if os.path.exists(self.train_file):
                train_data = pd.read_csv(self.train_file)
                print("extracting from train.csv...")
            else:
                train_data = pd.DataFrame()

            combined_data = pd.concat([train_data, dataset]).drop_duplicates()

            combined_data.to_csv(self.train_file, index=False)
            print(f"Length of train.csv {len(combined_data)}")

            # Limit to 400 unique rows
            if len(combined_data) > 400:
                combined_data = combined_data.iloc[:400]
                self.processed_count = 1
                print("Train.csv has reached its maximum capacity of 400 unique rows.")
        except Exception as e:
            print(f"Error updating {self.train_file}: {e}")



    def add_to_validation(self,dataset):
        try:
            if os.path.exists(self.val_file):
                val_data = pd.read_csv(self.val_file)
            else:
                val_data = pd.DataFrame()

            combined_data = pd.concat([val_data, dataset]).drop_duplicates()
            combined_data.to_csv(self.val_file, index=False)
            print(f"length of validation.csv {len(combined_data)}")

            # Limit to 100 unique rows
            if len(combined_data) > 100:
                combined_data = combined_data.iloc[:100]
                self.processed_count = 2
                print("Validation.csv has reached its maximum capacity of 100 unique rows.")
               
        except Exception as e:
            print(f"Error updating {self.val_file}: {e}")

    def add_to_test(self,dataset):
        try:
            test_data = pd.DataFrame()

            combined_data = pd.concat([test_data, dataset]).drop_duplicates()
            combined_data.to_csv(self.test_file, index=False)
        except Exception as e:
            print(f"Error updating {self.test_file}: {e}")



    def train_the_model(self, train_dataset, validation_dataset, model):
        try:
            scaler = MinMaxScaler()

            #Tranform training data for model
            X_train = scaler.fit_transform(train_dataset[['OpenIssuesCount', 'ForksCount', 'Size', 'description_length']])
            y_train = train_dataset['StargazersCount'].values

            X_train = torch.tensor(X_train, dtype=torch.float32)
            y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

            y_train_scaled = scaler.fit_transform(y_train.reshape(-1, 1))

            #Transform validation data for model
            X_val = scaler.fit_transform(validation_dataset[['OpenIssuesCount', 'ForksCount', 'Size', 'description_length']])
            y_val = validation_dataset['StargazersCount'].values

            X_val = torch.tensor(X_val, dtype=torch.float32)
            y_val = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)

            y_val_scaled = scaler.fit_transform(y_val.reshape(-1, 1))

            #model
            criterion = nn.MSELoss()
            optimizer = optim.Adam(model.parameters(), lr=0.0001)
            
            self.train(model, X_train, y_train_scaled, X_val, y_val_scaled, criterion, optimizer, 1000)
        except Exception as e:
            print(f"Error in Model training :( {e}")
        return
    
    def predict_the_trend(self,model):
        try:
            scaler = MinMaxScaler()

            test_data = pd.read_csv(self.test_file)
            n_test = self.normalise(test_data)

            #Transform testing data for model
            X_test = scaler.fit_transform(n_test[['OpenIssuesCount', 'ForksCount', 'Size', 'description_length']])
            y_test = n_test['StargazersCount'].values

            X_test = torch.tensor(X_test, dtype=torch.float32)
            y_test = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

            y_test = scaler.fit_transform(y_test.reshape(-1, 1))

            y_test_pred_scaled = model(X_test)

            y_test_pred  = scaler.inverse_transform(y_test_pred_scaled.detach().numpy())
            print("Predictions are : ",y_test_pred[:10])
            return y_test_pred


        except Exception as e:
            print(f"Error in predicting the trend {e}")

    def normalise(self, dataset):
        columns_to_normalize = ['OpenIssuesCount', 'ForksCount', 'Size', 'description_length']
        data_to_normalize = dataset[columns_to_normalize]

        scaler = MinMaxScaler()
        normalized_data = scaler.fit_transform(data_to_normalize)

        dataset[columns_to_normalize] = normalized_data

        return dataset
    

    def train(self, model, X_train, y_train, X_val, y_val, criterion, optimizer, epochs):
        for epoch in range(epochs):
            model.train()
            
            # Convert to PyTorch tensors
            X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
            y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)  # Reshape to column vector
            
            optimizer.zero_grad()  # Zero gradients
            predictions = model(X_train_tensor)  # Forward pass
            loss = criterion(predictions, y_train_tensor)  # Compute loss
            
            loss.backward()  # Backward pass
            optimizer.step()  # Update weights
            
            # Validation step
            if (epoch + 1) % 100 == 0:
                model.eval()
                with torch.no_grad():
                    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
                    y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)
                    val_predictions = model(X_val_tensor)
                    val_loss = criterion(val_predictions, y_val_tensor)

                # Print losses for each epoch
                print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")
    

if __name__ == "__main__":
    watch_directory = "./kafka_streaming/csv_data"  # Specify your directory containing CSV files

    if not os.path.exists(watch_directory):
        os.makedirs(watch_directory)

    event_handler = CSVHandler(watch_directory)
    observer = Observer()
    observer.schedule(event_handler, watch_directory, recursive=False)

    print(f"Monitoring directory: {watch_directory} for new CSV files...")
    try:
        observer.start()
        while True:
            time.sleep(5)  # Keeps the script running
    except KeyboardInterrupt:
        observer.stop()
    observer.join()


Monitoring directory: ./kafka_streaming/csv_data for new CSV files...
New CSV detected: /Users/shivamraj/Desktop/532/532_project/Github-repo-trend-predictor/kafka_streaming/csv_data/stream_2024-12-07_01-25-40.csv
Processing CSV file ...
pre-processing the data...
Getting training data and validation data and normalising...
Combine the data for normalization...
Normalising...
Training the model...
Epoch [100/1000], Train Loss: 0.0548, Val Loss: 0.0278
Epoch [200/1000], Train Loss: 0.0455, Val Loss: 0.0254


  X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
  X_val_tensor = torch.tensor(X_val, dtype=torch.float32)


Epoch [300/1000], Train Loss: 0.0327, Val Loss: 0.0182
Epoch [400/1000], Train Loss: 0.0198, Val Loss: 0.0150
Epoch [500/1000], Train Loss: 0.0126, Val Loss: 0.0177
Epoch [600/1000], Train Loss: 0.0103, Val Loss: 0.0219
Epoch [700/1000], Train Loss: 0.0094, Val Loss: 0.0238
Epoch [800/1000], Train Loss: 0.0089, Val Loss: 0.0243
Epoch [900/1000], Train Loss: 0.0084, Val Loss: 0.0244
Epoch [1000/1000], Train Loss: 0.0081, Val Loss: 0.0243
Model is trained!!!
Ready to predict the repo trend...
Trend predictor running ...
Predictions are :  [[239824.45]
 [299799.28]
 [239828.06]
 [252431.36]
 [193811.95]
 [180798.25]
 [155708.62]
 [120244.23]
 [166583.83]
 [132939.03]]
Waiting for next file...
New CSV detected: /Users/shivamraj/Desktop/532/532_project/Github-repo-trend-predictor/kafka_streaming/csv_data/stream_2024-12-07_01-27-43.csv
Processing CSV file ...
pre-processing the data...
Trend predictor running ...
Predictions are :  [[59577.707]
 [57878.887]
 [57765.953]
 [58208.46 ]
 [58039.

In [329]:
#normalise



columns_to_normalize = ['OpenIssuesCount', 'ForksCount', 'Size', 'description_length']
data_to_normalize = dataset[columns_to_normalize]

scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data_to_normalize)

dataset[columns_to_normalize] = normalized_data

print(dataset.head())

   OpenIssuesCount  ForksCount  StargazersCount      Size  description_length
0         0.001405    0.323685           331707  0.000062                  53
1         0.012317    0.393350           316814  0.000204                  30
2         0.011043    0.333945           306876  0.000043                  73
3         0.001797    0.899674           306451  0.000905                  69
4         0.001340    0.455732           296377  0.009895                 100


In [330]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X = scaler.fit_transform(dataset[['OpenIssuesCount', 'ForksCount', 'Size', 'description_length']])
Y = dataset['StargazersCount'].values


X_train, X_temp, y_train, y_temp, ids_train, ids_temp = train_test_split( X, Y, ids, test_size=0.4, random_state=42)


X_val, X_test, y_val, y_test, ids_val, ids_test = train_test_split(X_temp, y_temp, ids_temp, test_size=0.5, random_state=42)

print(X_train[:5])

[[3.16920966e-01 2.03640300e-01 1.53213288e-02 4.62724936e-03]
 [9.18090633e-03 1.36811267e-01 7.17448832e-03 3.42759212e-03]
 [8.03737707e-03 1.09067806e-02 1.97009079e-04 1.74807198e-02]
 [5.47913876e-02 5.25219724e-02 9.55433205e-03 9.42587832e-03]
 [2.71179795e-03 2.26724554e-02 4.05129828e-05 6.34104542e-03]]


In [331]:
scaler_target = MinMaxScaler()

# Fit the scaler on the training target and transform it
y_train_scaled = scaler_target.fit_transform(y_train.reshape(-1, 1))

# Apply the same transformation to the validation set
y_val_scaled = scaler_target.transform(y_val.reshape(-1, 1))

y_test_scaled = scaler_target.transform(y_test.reshape(-1, 1))


In [332]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error  # for regression tasks
import numpy as np

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)  # reshape for single output
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

print(X_train[:5])

tensor([[3.1692e-01, 2.0364e-01, 1.5321e-02, 4.6272e-03],
        [9.1809e-03, 1.3681e-01, 7.1745e-03, 3.4276e-03],
        [8.0374e-03, 1.0907e-02, 1.9701e-04, 1.7481e-02],
        [5.4791e-02, 5.2522e-02, 9.5543e-03, 9.4259e-03],
        [2.7118e-03, 2.2672e-02, 4.0513e-05, 6.3410e-03]])


In [347]:
model = SimpleNN()
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [348]:
def train(model, X_train, y_train, X_val, y_val, epochs):
    for epoch in range(epochs):
        model.train()
        
        # Convert to PyTorch tensors
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)  # Reshape to column vector
        
        optimizer.zero_grad()  # Zero gradients
        predictions = model(X_train_tensor)  # Forward pass
        loss = criterion(predictions, y_train_tensor)  # Compute loss
        
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights
        
        # Validation step
        if (epoch + 1) % 100 == 0:
            model.eval()
            with torch.no_grad():
                X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
                y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)
                val_predictions = model(X_val_tensor)
                val_loss = criterion(val_predictions, y_val_tensor)

            # Print losses for each epoch
            print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")


In [349]:
train(model, X_train, y_train_scaled, X_val, y_val_scaled, epochs=1000)

  X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
  X_val_tensor = torch.tensor(X_val, dtype=torch.float32)


Epoch [100/1000], Train Loss: 0.0102, Val Loss: 0.0132
Epoch [200/1000], Train Loss: 0.0099, Val Loss: 0.0127
Epoch [300/1000], Train Loss: 0.0093, Val Loss: 0.0121
Epoch [400/1000], Train Loss: 0.0083, Val Loss: 0.0109
Epoch [500/1000], Train Loss: 0.0070, Val Loss: 0.0091
Epoch [600/1000], Train Loss: 0.0058, Val Loss: 0.0074
Epoch [700/1000], Train Loss: 0.0053, Val Loss: 0.0064
Epoch [800/1000], Train Loss: 0.0051, Val Loss: 0.0062
Epoch [900/1000], Train Loss: 0.0049, Val Loss: 0.0062
Epoch [1000/1000], Train Loss: 0.0048, Val Loss: 0.0063


In [350]:
# Assuming you have test data X_test
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
model.eval()
with torch.no_grad():
    y_test_pred_scaled = model(X_test_tensor)
    
# Convert the predictions back to the original scale of the target variable
y_test_pred = scaler_target.inverse_transform(y_test_pred_scaled.detach().numpy())


  X_test_tensor = torch.tensor(X_test, dtype=torch.float32)


In [351]:
print(y_test_pred[:10])
print(y_test[:10])

[[ 32586.96 ]
 [ 26454.213]
 [ 32164.516]
 [147362.56 ]
 [ 33257.246]
 [ 25545.281]
 [ 50125.402]
 [ 30236.158]
 [ 26890.11 ]
 [ 30572.23 ]]
tensor([[21064.],
        [23680.],
        [25810.],
        [78940.],
        [45386.],
        [32945.],
        [48390.],
        [22882.],
        [22688.],
        [23864.]])


In [339]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming the model is already trained, and you've made predictions on the test set (X_test)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
model.eval()
with torch.no_grad():
    y_test_pred_scaled = model(X_test_tensor)

# Inverse transform the predictions back to original scale if you normalized the target
y_test_pred = scaler_target.inverse_transform(y_test_pred_scaled.detach().numpy())

# # If you normalized your test labels as well, do the same for y_test
# y_test_original = scaler_target.inverse_transform(y_test.reshape(-1, 1))

# Calculate the metrics
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred, squared=False)  # RMSE
r2 = r2_score(y_test, y_test_pred)

# Print out the metrics
print(f'Mean Absolute Error (MAE): {mae:.4f}')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
print(f'R-squared (R²): {r2:.4f}')


Mean Absolute Error (MAE): 13733.9590
Mean Squared Error (MSE): 846656640.0000
Root Mean Squared Error (RMSE): 29097.3652
R-squared (R²): 0.4309


  X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
