# Introduction


In this notebook I am trying to build an Arabic chatbot with a Reddit flair, "Dardesh", using Hugging Face's open GPT-2. I'll be using a pre-trained Arabic model and fine-tuning it.

# Data Collection and Processing


I collected data from two subreddits, Egypt and Arabs, formatted it into a chatbot structure, and split it into training and testing sets.

I used Pushshift (pushshift.io) to crawl all submissions in a subreddit, because the Reddit API returns at most 1000 submissions per subreddit.

I followed these tutorials: 

https://www.storybench.org/how-to-scrape-reddit-with-python/ 

https://www.textjuicer.com/2019/07/crawling-all-submissions-from-a-subreddit/


I mapped each submission title to all of its comments, mimicking question–answer pairs. I did this for both the Arabs and Egypt subreddits.

I also removed submissions or comments that were written entirely in English. I was left with a CSV file for each subreddit. Each CSV file had a column for the title, comment, date and link.

In [None]:
!pip install praw

In [None]:
!pip install langdetect

In [None]:

import praw
import pandas as pd
import datetime as dt
from praw.models import MoreComments
from langdetect import detect
import pickle

In [None]:
import requests

url = "https://api.pushshift.io/reddit/search/submission"

def crawl_page(subreddit: str, last_page = None):
    """
    Crawl a page of results from a given subreddit.

    :param subreddit: The subreddit to crawl.
    :param last_page: The last downloaded page, or None to start from the
        newest submissions.

    :return: A page of results (the ``"data"`` list of the JSON response).
        An empty list is returned without any request when ``last_page``
        is empty, i.e. the previous call already ran past the last page.
    :raises Exception: If the server answers with a non-OK status code.
        A request that exceeds the timeout raises the corresponding
        ``requests`` exception.
    """
    params = {"subreddit": subreddit, "size": 500, "sort": "desc", "sort_type": "created_utc"}
    if last_page is not None:
        if not last_page:
            # the last page was empty, we are past the last page
            return []
        # results are sorted newest-first, so resume just before the oldest
        # submission of the previous page
        params["before"] = last_page[-1]["created_utc"]
    # Bound the request: without a timeout a stalled connection would block
    # the whole crawl indefinitely.
    results = requests.get(url, params=params, timeout=30)
    if not results.ok:
        # something wrong happened
        raise Exception("Server returned status code {}".format(results.status_code))
    return results.json()["data"]

In [None]:
import time

def crawl_subreddit(subreddit, max_submissions=2000):
    """
    Crawl submissions from a subreddit, page by page.

    :param subreddit: The subreddit to crawl.
    :param max_submissions: The maximum number of submissions to download.
    :return: A list of at most ``max_submissions`` submissions.
    """
    submissions = []
    last_page = None
    while len(submissions) < max_submissions:
        last_page = crawl_page(subreddit, last_page)
        if not last_page:
            # an empty page means there is nothing older left to fetch
            break
        submissions += last_page
        if len(submissions) < max_submissions:
            # pause between requests only; no point sleeping once we are done
            time.sleep(3)
    # the final page may overshoot the requested amount
    return submissions[:max_submissions]

In [None]:
def subredditArabic(subredditName,limit):
    """
    Crawl a subreddit and build its title/comment table.

    :param subredditName: Name of the subreddit to crawl.
    :param limit: Maximum number of submissions to crawl; also forwarded to
        ``praw_submissions_comments``.
    :return: A tuple ``(topics_data, submissions)`` holding the table returned
        by ``praw_submissions_comments`` and the raw crawled submissions.
    """
    crawled = crawl_subreddit(subredditName, limit)
    return praw_submissions_comments(crawled, subredditName, limit), crawled

In [None]:
def praw_submissions_comments(lastest_submissions,subredditName,limit_sub):
    """
    Collect Arabic title/comment pairs for previously crawled submissions.

    Every submission whose title is detected as Arabic contributes one row per
    top-level comment that is also detected as Arabic. The table is written
    to a pickle file named ``subredditName`` and to
    ``<subredditName>_subreddit.csv``.

    :param lastest_submissions: Crawled submissions; each item must have an ``"id"`` key.
    :param subredditName: Subreddit name, used to name the output files.
    :param limit_sub: Not used by this function; kept so existing callers keep working.
    :return: A DataFrame with ``title``, ``id``, ``url`` and ``comment`` columns.
    """
    # NOTE(review): these look like placeholder credentials; real ones should
    # come from the environment or a config file, not from the notebook.
    reddit = praw.Reddit(client_id='client',
                         client_secret='secret',
                         user_agent='user',
                         username='username',
                         password='dardesh')
    topics_dict = {"title": [],
                   "id": [],
                   "url": [],
                   "comment": [],
                   }
    for sub in lastest_submissions:
        submission = reddit.submission(id=sub["id"])
        try:
            if detect(submission.title) != 'ar':
                continue
            for top_level_comment in submission.comments:
                if isinstance(top_level_comment, MoreComments):
                    # placeholder for collapsed comments, carries no body
                    continue
                try:
                    is_arabic = detect(top_level_comment.body) == 'ar'
                except Exception:
                    # best effort: language detection fails on some bodies
                    # (e.g. no detectable text); skip just that comment
                    continue
                if is_arabic:
                    topics_dict["title"].append(submission.title)
                    topics_dict["id"].append(submission.id)
                    topics_dict["url"].append(submission.url)
                    topics_dict["comment"].append(top_level_comment.body)
        except Exception:
            # best effort: skip a submission that cannot be fetched or whose
            # title cannot be classified, but let KeyboardInterrupt through
            continue
    topics_data = pd.DataFrame(topics_dict)
    topics_data.to_pickle(subredditName)
    # Name the CSV after the subreddit so crawling a second subreddit does not
    # overwrite the first one's file.
    topics_data.to_csv('{}_subreddit.csv'.format(subredditName), index = False)
    return topics_data

# Text Cleaning and Formatting

First I shuffled the CSV file. Then I did some cleaning, removing links and unknown characters. Then I formatted the text into chatbot format while splitting the file into training and testing sets.


In [None]:
import csv
import pandas as pd 

def process_reddits(file_train,file_test,subredditFile):
    """
    Shuffle a subreddit CSV and write it out as chat-formatted text.

    Each row becomes two lines: the title prefixed with the user tag and the
    cleaned comment prefixed with the bot tag. The first 80% of the shuffled
    rows go to ``file_train`` and the rest to ``file_test``. The shuffle is
    not seeded, so the split differs between runs.

    :param file_train: Writable text file object for the training split.
    :param file_test: Writable text file object for the testing split.
    :param subredditFile: Path of the CSV file with ``title`` and ``comment`` columns.
    """
    frame = pd.read_csv(subredditFile)
    print(frame.shape[0])
    # sampling the whole frame (frac=1) returns all rows in random order
    frame = frame.sample(frac=1)
    print(frame.shape[0])
    train_cutoff = round(frame.shape[0] * 0.8)
    for position, (_, record) in enumerate(frame.iterrows()):
        question = '[انت] : ' + record['title']
        answer = '[دردش] : ' + clean_comment(record['comment'])
        # rows before the cutoff are training data, the remainder is test data
        target = file_train if position < train_cutoff else file_test
        for piece in (question, '\n', answer, '\n'):
            target.write(piece)
            

In [None]:
import re


def clean_comment(original_text):
    """
    Strip links and markup noise from a comment.

    Removes anything starting with ``http`` up to the next whitespace, then
    deletes a fixed list of fragments one after another.

    :param original_text: The raw comment text.
    :return: The cleaned text.
    """
    cleaned = re.sub(r'http\S+','', original_text)
    # Fragments are removed strictly in this order; removing one can bring the
    # neighbours of a later one together (e.g. "*[*" becomes "**", which is
    # then removed as well), so the order must not be changed.
    fragments = (
        "[", "]",
        "{", "}",
        "(", ")",
        "�",
        # NOTE(review): this entry reads as an empty string, which makes its
        # replace a no-op; possibly an invisible character lost in an export —
        # confirm against the original notebook.
        "",
        "**",
        "##",
        "&#x200B;",   # HTML entity for a zero-width space
        "\u202c",     # Unicode "pop directional formatting" control character
    )
    for fragment in fragments:
        cleaned = cleaned.replace(fragment, "")
    return cleaned

I used the above method to create the dardesh_train_ar and dardesh_train_ar_eg files, where the ar files consist of both subreddits and the eg files consist of only the Egypt subreddit.


# Model Training


I have used these tutorials as a reference and starting point for training my model.

https://www.philschmid.de/fine-tune-a-non-english-gpt-2-model-with-huggingface

https://colab.research.google.com/drive/1Bz-P-ucyLMaCBmgTjS_QR8RoGsZ5WHwo?usp=sharing


I have used a pre-trained Arabic GPT-2 model, developed by Wessam Antoun and the AUB MIND Lab.

https://github.com/aub-mind/arabert/tree/master/examples

In [None]:
!pip install transformers==4.2.1
!pip install pyarabic
!git clone https://github.com/aub-mind/arabert

In [None]:
import importlib, pkg_resources, tokenizers
# Reload the already-imported pkg_resources and tokenizers modules in place.
# NOTE(review): presumably so the versions installed by the pip cell above are
# picked up without restarting the kernel — confirm.
importlib.reload(pkg_resources)
importlib.reload(tokenizers)

In [None]:
#textwrap enables formating of long text
import textwrap

from transformers import pipeline, GPT2TokenizerFast
from arabert.aragpt2.grover.modeling_gpt2 import GPT2LMHeadModel
from arabert.preprocess import ArabertPreprocessor

# Any aragpt2 model name can be used here since they all share the same preprocessing.

# Text preprocessor configured for the aragpt2-base model.
arabert_processor = ArabertPreprocessor(model_name="aragpt2-base")

In [None]:
!nvidia-smi

In [None]:
import torch
# Device index handed to the transformers pipeline below: 0 (first CUDA GPU)
# when CUDA is available, otherwise -1, which the pipeline treats as CPU.
device = 0 if torch.cuda.is_available() else -1
print(device)

In [None]:
# Hub identifier of the pre-trained Arabic GPT-2 base checkpoint.
model_name = "aubmindlab/aragpt2-base"

# Text-generation pipeline over the model before any fine-tuning, on the device selected above.
aragpt2_pipeline = pipeline("text-generation",model=model_name,device=device)

In [None]:
!cp -r '../input/arabicreddit/dardesh_train_ar_eg.txt' ./
!cp -r '../input/arabicreddit/dardesh_test_ar_eg.txt' ./

In [None]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the pre-trained checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")

# Train/test text files (Egypt-only variant) copied into the working directory by the cell above.
train_path = './dardesh_train_ar_eg.txt'
test_path = './dardesh_test_ar_eg.txt'

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer,block_size=128):
    """
    Build the training/testing datasets and the collator for causal LM fine-tuning.

    :param train_path: Path of the training text file.
    :param test_path: Path of the testing text file.
    :param tokenizer: Tokenizer used to encode both files and by the collator.
    :param block_size: Block size passed to both ``TextDataset`` instances
        (defaults to 128, the value previously hard-coded).
    :return: A tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    def _text_dataset(path):
        # both splits must be built with identical settings
        return TextDataset(
            tokenizer=tokenizer,
            file_path=path,
            block_size=block_size)

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False: plain (causal) language modelling rather than masked LM
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

# Build both datasets and the collator from the paths and tokenizer defined earlier.
train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead

# Start from the pre-trained Arabic GPT-2 checkpoint.
# NOTE(review): AutoModelWithLMHead is deprecated in transformers in favour of
# AutoModelForCausalLM; kept because it is the name this cell imports.
model = AutoModelWithLMHead.from_pretrained("aubmindlab/aragpt2-base")

training_args = TrainingArguments(
    output_dir="./trained_model", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    # Evaluate every eval_steps during training. With the default strategy
    # ("no") eval_steps is ignored and eval_dataset is never used.
    # (Argument name as of the transformers 4.2.1 release pinned above.)
    evaluation_strategy="steps",
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the final model to ./trained_model, the same directory used as output_dir.
trainer.save_model('./trained_model')

In [None]:
!zip -r file.zip ./trained_model

<a href="./file.zip"> Download File </a>


In [None]:
from transformers import pipeline

# Text-generation pipeline over the fine-tuned weights in ./trained_model, using the base checkpoint's tokenizer.
# NOTE(review): `config` is documented as a config name or a PretrainedConfig; confirm that a plain dict is
# honoured and that max_length=35 actually takes effect.
bot = pipeline('text-generation',model='./trained_model', tokenizer='aubmindlab/aragpt2-base',config={'max_length':35})


# Interacting with the model

In [None]:

# Interactive loop: each question is wrapped in the same speaker tags that the
# training files use, and the model is asked to continue after the bot tag.
while True:
    try:
        ques = input("Question : ")
    except (EOFError, KeyboardInterrupt):
        # End of input or an interrupted kernel: leave the loop cleanly
        # instead of ending the cell with a traceback.
        break

    inp = '[انت] : '+ques+'\n'+'[دردش] : '

    # generated_text contains the prompt followed by the model's continuation
    result = bot(inp)[0]['generated_text']

    print(result)

