In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("../processed_data/final_pre-processed.csv")

In [4]:
df = df.drop('Unnamed: 0',axis=1)[:4000]
df["Ratings"] = df["Ratings"].astype(int)

In [5]:
text_vectorizer = TfidfVectorizer(max_df=.8)
text_vectorizer.fit(df['reviewText'])
def rate(r):
    ary2 = []
    for rating in r:
        tv = [0,0,0,0,0]
        tv[rating-1] = 1
        ary2.append(tv)
    return np.array(ary2)

In [8]:
X = text_vectorizer.transform(df['reviewText']).toarray()
y = rate(df['Ratings'].values)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2)

In [9]:
global_model = Sequential()
global_model.add(Dense(128,input_dim=X_train.shape[1]))
global_model.add(Dense(5,activation='softmax'))

In [10]:
global_model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

In [12]:
NUM_CLIENTS = 10
EPOCHS_PER_ROUND = 10
BATCH_SIZE = 32

In [14]:
x_train_clients = np.array_split(X_train,NUM_CLIENTS)
y_train_clients = np.array_split(y_train,NUM_CLIENTS)

x_test_clients = np.array_split(X_test,NUM_CLIENTS)
y_test_clients = np.array_split(y_test,NUM_CLIENTS)

In [24]:
client_models = []
for i in range(NUM_CLIENTS):
    local_model = Sequential()
    local_model.add(Dense(128,input_dim=X_train.shape[1]))
    local_model.add(Dense(5,activation='softmax'))
    local_model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
    local_model.fit(x_train_clients[i],y_train_clients[i],epochs=EPOCHS_PER_ROUND, batch_size=BATCH_SIZE,verbose=0)
    acc = local_model.evaluate(x_test_clients[i],y_test_clients[i])
    print(acc)
    client_models.append(local_model)

[0.8231924176216125, 0.6875]
[0.9189920425415039, 0.675000011920929]
[0.8193017840385437, 0.7250000238418579]
[0.8563387989997864, 0.699999988079071]
[0.8214240074157715, 0.7124999761581421]
[0.8604847192764282, 0.7124999761581421]
[0.8079009056091309, 0.762499988079071]
[0.8607262372970581, 0.7250000238418579]
[0.9332224726676941, 0.675000011920929]
[0.9642958641052246, 0.699999988079071]


In [25]:
# Performing Federated Averaging
weights = global_model.get_weights()
for i in range(len(weights)):
    for j in range(NUM_CLIENTS):
        client_weights = client_models[j].get_weights()
        weights[i] += client_weights[i]/NUM_CLIENTS

global_model.set_weights(weights)

In [26]:
acc = global_model.evaluate(X_test,y_test)
print(acc)

[1.572872519493103, 0.6725000143051147]


In [27]:
# Train the Fed AVG model
fed_model = global_model
fed_model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
fed_model.fit(X_train,y_train,epochs=10,batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x3b3bbf220>

In [28]:
acc = fed_model.evaluate(X_test,y_test)
print(acc)

[1.711401343345642, 0.6487500071525574]
