In [222]:
import pandas as pd
import numpy as np
import glob 
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [223]:
# read all csv files in the folder Dataset
df = pd.concat([pd.read_csv(f) for f in glob.glob('Dataset/*.csv')], ignore_index = True)

In [224]:
df = df.drop(df.columns[0], axis=1)

In [225]:
df

Unnamed: 0,variance,similarity,delta_mod,size,nodes_num,nb_communities,modularity
0,0.000000e+00,1.000000,0.000000,10,69,4,0.503137
1,0.000000e+00,1.000000,0.000000,10,69,4,0.503137
2,6.595846e-07,0.992093,0.002707,10,69,4,0.505845
3,1.232595e-32,1.000000,0.000000,10,69,4,0.505845
4,2.012341e-06,0.991126,0.004729,10,69,4,0.510573
...,...,...,...,...,...,...,...
25386,7.888609e-31,1.000000,0.000000,100,19,4,0.486395
25387,7.888609e-31,1.000000,0.000000,100,19,4,0.486395
25388,7.888609e-31,1.000000,0.000000,100,19,4,0.486395
25389,7.888609e-31,1.000000,0.000000,100,19,4,0.486395


In [226]:
# get only 6 number after the comma in variance and similarity 
df['variance'] = df['variance'].apply(lambda x: round(x, 6))
df['similarity'] = df['similarity'].apply(lambda x: round(x, 6))


In [227]:
# row num
df.shape[0]

25391

In [228]:
# remove duplicate rows
df = df.drop_duplicates()

In [229]:
df.shape[0]

20887

In [230]:
# group by variance similarity and nodes num and nb_communities where delta_mod is max get the index of the max delta_mod
df_g = df.loc[df.groupby(['variance', 'similarity', 'nodes_num', 'nb_communities'])['delta_mod'].idxmax()]

In [231]:
df_g

Unnamed: 0,variance,similarity,delta_mod,size,nodes_num,nb_communities,modularity
3065,0.000000,0.129090,0.000988,100,45,2,0.000000
19858,0.000000,0.183162,0.000988,80,45,2,0.000000
17675,0.000000,0.183956,0.000988,70,45,2,0.000000
18761,0.000000,0.249932,0.001953,75,45,3,0.000000
12191,0.000000,0.252539,0.000965,50,45,3,-0.000988
...,...,...,...,...,...,...,...
20359,0.001185,0.884335,0.058172,95,17,8,0.164820
5623,0.001206,0.875532,0.147392,15,19,8,0.284580
2468,0.001574,0.919275,0.112188,100,17,9,0.012465
2469,0.001913,0.873288,0.094183,100,17,9,0.106648


In [232]:
# reset indexs
df_g = df_g.reset_index(drop=True)

In [233]:
# get coreelation matrix of the dataframe
df_g.corr()

Unnamed: 0,variance,similarity,delta_mod,size,nodes_num,nb_communities,modularity
variance,1.0,-0.137149,0.503398,0.003714,-0.080547,-0.065092,-0.024084
similarity,-0.137149,1.0,-0.532818,-0.092848,0.396171,0.3327,-0.018315
delta_mod,0.503398,-0.532818,1.0,0.006357,-0.328493,-0.264351,-0.020746
size,0.003714,-0.092848,0.006357,1.0,0.039396,0.07533,0.012791
nodes_num,-0.080547,0.396171,-0.328493,0.039396,1.0,0.85902,-0.284433
nb_communities,-0.065092,0.3327,-0.264351,0.07533,0.85902,1.0,-0.557405
modularity,-0.024084,-0.018315,-0.020746,0.012791,-0.284433,-0.557405,1.0


In [234]:
# split the data into input and output variables
X = df_g.drop("size", axis=1)
y = df_g["size"]

In [235]:
X

Unnamed: 0,variance,similarity,delta_mod,nodes_num,nb_communities,modularity
0,0.000000,0.129090,0.000988,45,2,0.000000
1,0.000000,0.183162,0.000988,45,2,0.000000
2,0.000000,0.183956,0.000988,45,2,0.000000
3,0.000000,0.249932,0.001953,45,3,0.000000
4,0.000000,0.252539,0.000965,45,3,-0.000988
...,...,...,...,...,...,...
19361,0.001185,0.884335,0.058172,17,8,0.164820
19362,0.001206,0.875532,0.147392,19,8,0.284580
19363,0.001574,0.919275,0.112188,17,9,0.012465
19364,0.001913,0.873288,0.094183,17,9,0.106648


In [236]:
X=np.array(X)

In [221]:
X

array([[0.00000000e+00, 1.29090000e-01, 9.87654321e-04, 4.50000000e+01,
        2.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.83162000e-01, 9.87654321e-04, 4.50000000e+01,
        2.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.83956000e-01, 9.87654321e-04, 4.50000000e+01,
        2.00000000e+00, 0.00000000e+00],
       ...,
       [1.57400000e-03, 9.19275000e-01, 1.12188366e-01, 1.70000000e+01,
        9.00000000e+00, 1.24653740e-02],
       [1.91300000e-03, 8.73288000e-01, 9.41828255e-02, 1.70000000e+01,
        9.00000000e+00, 1.06648199e-01],
       [2.06610000e-02, 8.18182000e-01, 5.00000000e-01, 2.00000000e+00,
        2.00000000e+00, 0.00000000e+00]])

In [237]:
# normalize the input variables
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [238]:
# save the scaler without feature names
import pickle
pickle.dump(scaler, open('scaler.pkl', 'wb'))


In [151]:
# y=pd.get_dummies(y)

In [239]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [240]:
modal = LinearRegression()
modal.fit(X_train, y_train)

In [241]:
# evaluate the model using R-squared
r_squared = modal.score(X_test, y_test)
print("R-squared:", r_squared)

R-squared: 0.0415441252611668


In [242]:
# save the model to disk
import pickle
filename = 'linearRegression_model.pkl'
pickle.dump(modal, open(filename, 'wb'))

In [193]:
# create NN model regression
model = Sequential()
model.add(Dense(32, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='linear'))

In [194]:
model.compile(loss='mean_absolute_error', optimizer=Adam(lr=0.001), metrics=['mean_absolute_error'])

In [195]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_21 (Dense)            (None, 32)                224       
                                                                 
 dense_22 (Dense)            (None, 64)                2112      
                                                                 
 dense_23 (Dense)            (None, 16)                1040      
                                                                 
 dense_24 (Dense)            (None, 8)                 136       
                                                                 
 dense_25 (Dense)            (None, 1)                 9         
                                                                 
Total params: 3,521
Trainable params: 3,521
Non-trainable params: 0
_________________________________________________________________


In [197]:
# train the model
model.fit(X_train, y_train,validation_data=(X_test,y_test), epochs=100, batch_size=16, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1f93772c580>

In [199]:
# evaluate the model using R-squared
r_squared = model.evaluate(X_test, y_test)
print("R-squared:", r_squared)

R-squared: [20.30165672302246, 20.30165672302246]


In [113]:
X2 = df.drop("delta_mod", axis=1)
y2 = df["delta_mod"]

In [114]:
# split the data into training and testing sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [116]:
modal2 = LinearRegression()
modal2.fit(X_train2, y_train2)

In [117]:
# make predictions on the test set and print the results
y_pred2 = modal2.predict(X_test2)
print("Predictions:", y_pred2)
print("Actual values:", y_test2.values)

Predictions: [0.00044632 0.00491586 0.00242556 ... 0.00035621 0.00329236 0.00887766]
Actual values: [0.00053022 0.00308277 0.00064959 ... 0.00052938 0.00216146 0.01211706]


In [118]:
# evaluate the model using R-squared
r_squared = modal2.score(X_test2, y_test2)
print("R-squared:", r_squared)

R-squared: 0.6889139034298117


In [None]:
# build 