In [None]:
!pip install tensorflow

In [1]:

from tensorflow import keras
from tensorflow.keras import layers, models, callbacks



from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import mean_absolute_error

from tensorflow.keras.callbacks import EarlyStopping

In [3]:
def string_number(x):
    x = x.lower()
    new = ""
    for each in x :
        new = new + str( ord(each) )
    return new


def char_number(x):    
    return str( ord(x)  )



def getConvertedFormatofText(x):

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(x.values.astype('U'))
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    return tfidf_transformer


def getValidationData(xtest, ytest):
    xtest, xvalid, ytest, yvalid = train_test_split(xtest, ytest, test_size=0.50, random_state=42)
    return xtest, xvalid, ytest, yvalid


def getWordSplitData():
    data = pd.read_csv("word_split_replacement_encoded.tsv", sep="\t")
    data.columns = ["words", "output", "wordlen"]
    #     data = data.dropna()    
    x = data.words.map(char_number)
    x = x.values.astype('int32')
    y = data.output
    y = y.values.astype('int64')
    return x, y


def getWordSplitOHEData():
    data = pd.read_csv("word_splits_ohe.tsv", sep="\t")
#     data = data.dropna()
    l = [x for x in range(2, data.shape[1])]
    x = data.iloc[:,l]
    x = x.values.astype('int16')
    y = data.output
    y = y.values.astype('int64')
    return x, y


def getTrainTestData():        
    x, y = getWordSplitData()
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.40, random_state = 42)
    xtest, xvalid, ytest, yvalid = getValidationData(xtest, ytest)
    return  xtrain, xtest, xvalid, ytrain, ytest, yvalid


In [4]:

def getTheNetwork(num=6):
    i = 0
    list_dense = []    
    list_dense.append(layers.Dense(units= 100 * (num) , activation='relu', input_dim=1))
#     list_dense.append(layers.Dropout(0.3))
    for x in range(2,num):
        i = i+1
        list_dense.append(layers.Dense(units= 100 * ((num)-i), activation='relu' ))        
#         list_dense.append(layers.Dropout(0.3))
        
    
    list_dense.append(layers.Dense(units= 100, activation='relu' ))
    list_dense.append(layers.Dense(units=1))

    return  list_dense

In [5]:
model = keras.Sequential(getTheNetwork(6))

In [6]:


model.compile(optimizer='adam', loss = 'huber', metrics=['mse', 'mae', 'mape'])


early_stopping = EarlyStopping(
    min_delta=0.01,
    patience=40,
    restore_best_weights=True
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 600)               1200      
_________________________________________________________________
dense_1 (Dense)              (None, 500)               300500    
_________________________________________________________________
dense_2 (Dense)              (None, 400)               200400    
_________________________________________________________________
dense_3 (Dense)              (None, 300)               120300    
_________________________________________________________________
dense_4 (Dense)              (None, 200)               60200     
_________________________________________________________________
dense_5 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 1

In [7]:
xtrain, xtest, xvalid, ytrain, ytest, yvalid = getTrainTestData()

In [8]:
xtrain[0:5], ytrain[0:5]

(array([116, 111,  98, 107, 109], dtype=int32),
 array([2070, 5120, 2250, 6160, 9140]))

In [9]:
export_path_keras = "replacement_file_hacker.h5"
save_checkpoint = keras.callbacks.ModelCheckpoint(
    export_path_keras, monitor='loss', verbose=1, save_best_only=True,
    save_weights_only=False, mode='min', save_freq='epoch'
)

In [None]:

history = model.fit(
    xtrain, ytrain,
    validation_data=(xvalid, yvalid),
    batch_size=100,
    epochs=100,
    callbacks=[early_stopping, save_checkpoint],
    verbose=1
)


Epoch 1/100

Epoch 00001: loss improved from inf to 1191.03882, saving model to replacement_file_hacker.h5
Epoch 2/100

Epoch 00002: loss improved from 1191.03882 to 430.04779, saving model to replacement_file_hacker.h5
Epoch 3/100
 1555/18801 [=>............................] - ETA: 3:00 - loss: 450.1436 - mse: 1193953.8729 - mae: 450.6422 - mape: 17.1643

In [None]:
model = models.load_model('replacement_file_hacker.h5')
model.summary()

In [None]:
ypred = model.predict(xtest)

mean_absolute_error(ytest, ypred)

In [13]:
ypred = model.predict([[2070][5120]])

ypred

array([[109.263245]], dtype=float32)

In [None]:

history_df = pd.DataFrame(history.history)
history_df.loc[5:, ['loss', 'val_loss']].plot()
history_df.loc[5:, ['mae', 'val_mae']].plot()

print(history_df['val_loss'].min())
print(history_df['val_mae'].min())

In [3]:
#convert a word to single char cell

def writeRow(char, enc, wlen):
     with open("word_split_replacement_encoded.tsv", 'a') as decodeMsgWritter:
        decodeMsgWritter.write("\n{0}\t{1}\t{2}".format(char,enc,wlen))


def convertDatato_chars_encryption(x):
    row_x = x.words; row_y = x.output;
#     row_x = list(row_x.lower())
    for each in range(4,len(row_y)+4,4):
        if each-4 == 0:
            writeRow(row_x[int(each/4)-1],row_y[each-4:each], len(row_x) )            
        else:
            writeRow(row_x[int(each/4)-1],row_y[each-4:each], 0 )

    
    
    
data = pd.read_csv("title_encoded.tsv", sep="\t")
data.columns=["words","output"]
data = data.dropna()


data.apply(convertDatato_chars_encryption, axis = 1)
print('completed')

completed
