## The input [dataset](https://www.kaggle.com/datasets/ilyaryabov/correctly-ordered-notebooks-for-google-ai4code) with correctly ordered notebooks for the Google AI4Code competition

## The output [dataset](https://www.kaggle.com/datasets/ilyaryabov/fasttext-model-for-google-ai4code) with pretrained models

## This notebook describes how to create a fastText model for the Google AI4Code competition

In [None]:
# Progress marker emitted before the import cell runs.
print("importing libraries...")

In [None]:
import numpy as np
import pandas as pd
import os
import re
import fasttext
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer

In [None]:
# Kaggle input directory named after the AI4Code competition.
# NOTE(review): `src` is not referenced by any other cell visible here.
src = '/kaggle/input/AI4Code/'
# Directory whose entries are listed and read with pd.read_csv further down.
dataset_path = '/kaggle/input/correctly-ordered-notebooks-for-google-ai4code/dataset/dataset/'

In [None]:
# Despite its name, this object is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()


def preprocess_text(document):
    """Normalize a piece of text into a space-separated string of tokens.

    The input is converted with ``str()``, so non-string values (e.g. NaN)
    are accepted. Processing steps, in order:

    1. replace every non-word character with a space;
    2. drop single ASCII letters standing alone (inside or at the start);
    3. collapse whitespace runs and strip a leading standalone ``b``;
    4. lowercase and delete all digits;
    5. lemmatize each token and keep only tokens longer than two characters.

    Parameters
    ----------
    document : object
        Text to clean; anything accepted by ``str()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    # Replace every non-word character with a space.
    text = re.sub(r'\W', ' ', str(document))

    # Drop single letters surrounded by whitespace.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Drop a single letter at the very start of the text. The caret must be
    # an unescaped anchor: an escaped `\^` would look for a literal '^',
    # which cannot exist any more after the non-word replacement above.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Collapse runs of whitespace into one space.
    text = re.sub(r'\s+', ' ', text)

    # Strip a leading standalone 'b' (e.g. left over from a bytes repr).
    text = re.sub(r'^b\s+', '', text)

    text = text.lower()

    # Delete digits; letters on both sides of a digit run are merged.
    text = re.sub(r'[0-9]+', '', text)

    # Lemmatize first, then filter: a lemma can be shorter than its token.
    lemmas = (stemmer.lemmatize(word) for word in text.split())
    return ' '.join(lemma for lemma in lemmas if len(lemma) > 2)
    
    
def preprocess_df(df):
    """Apply ``preprocess_text`` to each entry of ``df.source``.

    Returns a list with one cleaned string per entry, in the same order.
    """
    return list(map(preprocess_text, df.source))


In [None]:
# Experiments were run with
# N = 10000 (~7% of the dataset) and
# N = 140000 (100% of the dataset);
# they gave approximately the same result.

N = 10000

print('searching files...')
# os.listdir returns entries in arbitrary order, so sort before slicing to
# pick the same N files on every run.
files = sorted(os.listdir(dataset_path))[:N]
print('reading files...')
dfs = [pd.read_csv(os.path.join(dataset_path, file)) for file in tqdm(files)]
print('processing files...')
# One list of preprocessed `source` entries per file that was read.
sourses = [preprocess_df(df) for df in tqdm(dfs)]

In [None]:
# Write the preprocessed corpus to a text file, one entry per line; this
# file is what the training cell passes to fastText.
processed_data = "preprocessed_messages.txt"
print('creating txt file...')
# `sourses` already holds the preprocess_text output for every entry of every
# DataFrame in `dfs`, so it is reused here instead of preprocessing the whole
# corpus a second time. The encoding is explicit so the output does not
# depend on the platform default; the `with` block closes the file.
with open(processed_data, "w", encoding="utf-8") as fout:
    for notebook_lines in tqdm(sourses):
        fout.writelines('{}\n'.format(line) for line in notebook_lines)

In [None]:
# Fit unsupervised fastText embeddings on the text file written above.
# Only the input path is passed, so every other setting is left to the
# library's defaults.
print('teaching model...')
model = fasttext.train_unsupervised(input=processed_data)

In [None]:
# Persist the trained model; N is embedded in the file name.
print('saving model...')
model_file = f'model{N}.bin'
model.save_model(model_file)
print('DONE')

In [None]:
from zipfile import ZipFile

# Pack the saved model into model.zip. The context manager closes the archive
# explicitly; a ZipFile that is never closed only gets its final records
# written if and when the object is finalized.
with ZipFile('model.zip', mode='w') as archive:
    archive.write(f'model{N}.bin')

In [None]:
# Remove the intermediate corpus file and the uncompressed model file(s) from
# the working directory; model.zip written by the previous cell is not touched.
!rm preprocessed_messages.txt
!rm model*.bin

## To be continued here: [Model application](https://www.kaggle.com/ilyaryabov/fastttext-sorting-with-cosine-distance-algo)