In [135]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import csv
from sklearn.ensemble import RandomForestClassifier

import pandas as pd

In [124]:
data = open('data/gen_data/cleaned_file_all.txt', 'r').read()
data

"a wounded deer leaps highest i've heard the hunter tell 't is but the ecstasy of death and then the brake is still the smitten rock that gushes the trampled steel that springs a cheek is always redder just where the hectic stings mirth is the mail of anguish in which it cautions arm lest anybody spy the blood and you're hurt exclaim one dignity delays for all one mitred afternoon none can avoid this purple none evade this crown coach it insures and footmen chamber and state and throng bells also in the village as we ride grand along what dignified attendants what service when we pause how loyally at parting their hundred hats they raise how pomp surpassing ermine when simple you and i present our meek escutcheon and claim the rank to die safe in their alabaster chambers untouched by morning and untouched by noon sleep the meek members of the resurrection rafter of satin and roof of stone light laughs the breeze in her castle of sunshine babbles the bee in a stolid ear pipe the sweet b

In [125]:
def create_dataset(data:str, window_size:int=2)-> None:
    with open("data/gen_data/test_data.csv", 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Previous_ch', "Next_char"])
        i = 0
        while i<len(data)-window_size-1:
            writer.writerow([data[i:i+window_size], data[i+window_size]])
            i+=1
    return

create_dataset(data)


In [126]:
df = pd.read_csv('data/gen_data/test_data.csv')
df

Unnamed: 0,Previous_ch,Next_char
0,a,w
1,w,o
2,wo,u
3,ou,n
4,un,d
...,...,...
16440,ke,n
16441,en,
16442,n,h
16443,h,e


In [127]:
all_chars = "".join(df["Previous_ch"]) + "".join(df['Next_char'])
unique_chars_set = list(set(all_chars))
ch_to_int = {}
i = 0
for ch in unique_chars_set:
    ch_to_int[ch] = i
    i+=1

int_to_ch = {}
for ch, int in ch_to_int.items():
    int_to_ch[int] = ch


In [128]:
df['target'] = df['Next_char'].apply(lambda x: ch_to_int[x])

def encode_row(prev_str):
    lis = []
    for ch in prev_str:
        lis.append(ch_to_int[ch])
    return lis



df.drop(columns=['Next_char'], inplace=True)

In [129]:
df['encoded_prev_ch'] = df['Previous_ch'].apply(encode_row)
df
df.drop(columns=['Previous_ch'])

Unnamed: 0,target,encoded_prev_ch
0,9,"[13, 23]"
1,10,"[23, 9]"
2,7,"[9, 10]"
3,1,"[10, 7]"
4,0,"[7, 1]"
...,...,...
16440,1,"[22, 8]"
16441,23,"[8, 1]"
16442,25,"[1, 23]"
16443,8,"[23, 25]"


In [130]:
X = pd.DataFrame(df['encoded_prev_ch'].tolist())
y = df['target'] 

y

0         9
1        10
2         7
3         1
4         0
         ..
16440     1
16441    23
16442    25
16443     8
16444    20
Name: target, Length: 16445, dtype: int64

In [131]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=42
)


### Decision Trees

In [132]:
model = DecisionTreeClassifier(max_depth=20, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", acc)


Test Accuracy: 0.3721495895408939


In [133]:
def gen_text(model, start_text, char_to_int, int_to_ch, length=50):
    output = start_text
    window_size = len(start_text)

    while len(output) < length:
        prev_start = output[-window_size:]
        X_inp = [[char_to_int[ch] for ch in prev_start]]
        output_label = model.predict(X_inp)
        output+=int_to_ch[output_label[0]]

    return output



In [134]:
gen_text(model, 'az', ch_to_int, int_to_ch, length=10)

'aze the th'

### Random Forest

In [136]:
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

y_pred = model_rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Random Forest Test Accuracy:", acc)

Random Forest Test Accuracy: 0.37549407114624506


In [137]:
gen_text(model_rf, 'az', ch_to_int, int_to_ch, length=10)

'aze the th'