<a href="https://colab.research.google.com/github/saJeelakhan/CodeX_AI_Technical_Assessment_01/blob/main/python_assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install praw



In [None]:
import requests
from bs4 import BeautifulSoup
import praw
import pandas as pd
import re
import nltk
import random
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources used further down: the Punkt tokenizer data needed by
# word_tokenize / sent_tokenize, and the stopword lists read in preprocess_text.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Utility Functions
def is_valid_greeklish(text):
    """Heuristically decide whether ``text`` is Greek written with Latin letters.

    A text is accepted when all of the following hold:
      * it is at least 3 characters long and is not one of the placeholders
        "[deleted]" / "[removed]";
      * it contains no Greek-script and no Cyrillic-script character;
      * it contains at least two distinct words from a small list of common
        Greeklish words.

    Args:
        text: Candidate string (title, comment body, paragraph text).

    Returns:
        bool: True when the heuristic accepts the text as Greeklish.
    """
    if len(text) < 3 or text in ["[deleted]", "[removed]"]:
        return False
    # Cover the whole Unicode blocks (Greek and Coptic, Greek Extended, Cyrillic)
    # so accented capitals such as 'Ώ', polytonic Greek and letters such as 'ё'
    # are rejected as well, not only the basic alphabet ranges.
    if re.search(r'[\u0370-\u03FF\u1F00-\u1FFF]', text) or re.search(r'[\u0400-\u04FF]', text):
        return False
    greeklish_words = {"kaneis", "einai", "thelw", "gia", "kai", "den", "ti", "sou", "mou", "ellinika", "greeklish", "etsi", "prepei", "pame", "mporei", "xerei", "thes", "opote"}
    # Compare on distinct lowercase words: repeating one marker word does not count twice.
    words = set(re.findall(r'\b\w+\b', text.lower()))
    return len(words & greeklish_words) >= 2

def clean_text(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

def split_into_sentences(paragraph):
    """Split ``paragraph`` on '.', '!' and '?' and return the non-empty pieces.

    Each piece is stripped of surrounding whitespace; pieces that are empty
    after stripping are dropped. The terminating punctuation is not kept.
    """
    sentences = []
    for fragment in re.split(r'[.!?]', paragraph):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

def preprocess_text(text):
    """Normalise ``text`` into a space-joined string of lowercase tokens.

    Steps: lowercase, drop every character that is not a-z or whitespace,
    collapse whitespace, tokenize with NLTK's word_tokenize, and remove
    English stopwords. Returns an empty string when nothing survives.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    normalized = re.sub(r'\s+', ' ', letters_only).strip()
    candidate_tokens = word_tokenize(normalized)
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in candidate_tokens:
        if token not in english_stopwords:
            kept.append(token)
    return ' '.join(kept)

In [None]:
# Greeklish Sources
def scrape_reddit_greeklish():
    """Collect up to 200 Greeklish titles and comments from r/greece.

    Searches r/greece for "greeklish" (at most 300 submissions) and keeps
    every submission title and comment body accepted by is_valid_greeklish.
    Collection stops after the first submission that brings the total to
    200 or more.

    Reddit API credentials are read from the REDDIT_CLIENT_ID and
    REDDIT_CLIENT_SECRET environment variables so that no secret is stored
    in the notebook.

    Returns:
        list[str]: At most 200 accepted texts.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
    """
    print("Scraping Reddit Greeklish posts...")
    client_id = os.environ.get("REDDIT_CLIENT_ID")
    client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
    if not client_id or not client_secret:
        raise RuntimeError(
            "Set the REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET environment "
            "variables before scraping Reddit."
        )
    reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='GreeklishScraper', check_for_async=False)
    posts = []
    subreddit = reddit.subreddit("greece")
    for submission in subreddit.search("greeklish", limit=300):
        if is_valid_greeklish(submission.title):
            posts.append(submission.title)
        if submission.comments:
            # limit=0 drops the "load more comments" placeholders instead of fetching them.
            submission.comments.replace_more(limit=0)
            for comment in submission.comments.list():
                if is_valid_greeklish(comment.body):
                    posts.append(comment.body)
        if len(posts) >= 200:
            break
    # Truncate before logging so the reported count matches what is returned.
    posts = posts[:200]
    print(f"Fetched {len(posts)} Greeklish posts from r/greece")
    return posts

def scrape_insomnia():
    """Collect up to 100 Greeklish paragraphs from the Insomnia.gr forum index.

    Downloads the forum landing page, takes the text of every <p> element and
    keeps those accepted by is_valid_greeklish.

    Returns:
        list[str]: At most 100 accepted texts; an empty list when the page
        cannot be fetched (the failure is printed, not raised, so the other
        sources can still be collected).
    """
    print("Scraping Greeklish from Insomnia.gr...")
    url = 'https://www.insomnia.gr/forums/'
    try:
        # A timeout keeps an unresponsive server from hanging the notebook;
        # raise_for_status stops an HTTP error page from being parsed as content.
        response = requests.get(url, timeout=15)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Could not fetch {url}: {exc}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    greeklish = [p.text for p in paragraphs if is_valid_greeklish(p.text)][:100]
    print(f"Fetched {len(greeklish)} Greeklish posts from Insomnia.gr")
    return greeklish

def scrape_youtube():
    """Collect up to 100 Greeklish comments from one YouTube watch page.

    Downloads the watch page HTML, looks for
    <yt-formatted-string class="style-scope ytd-comment-renderer"> elements and
    keeps the texts accepted by is_valid_greeklish.

    NOTE(review): the recorded run of this notebook fetched 0 comments from
    this page; presumably the comment elements are not present in the HTML
    returned to a plain HTTP request - confirm, and consider another source.

    Returns:
        list[str]: At most 100 accepted texts; an empty list when the page
        cannot be fetched (the failure is printed, not raised).
    """
    print("Scraping Greeklish YouTube comments...")
    url = 'https://www.youtube.com/watch?v=_akH1Bns2B8'
    try:
        # Bound the wait and reject HTTP error responses instead of parsing them.
        response = requests.get(url, timeout=15)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Could not fetch {url}: {exc}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    comments = soup.find_all('yt-formatted-string', class_='style-scope ytd-comment-renderer')
    greeklish = [c.text for c in comments if is_valid_greeklish(c.text)][:100]
    print(f"Fetched {len(greeklish)} Greeklish YouTube comments")
    return greeklish

In [None]:
# English Sources
def scrape_reddit_english():
    """Collect up to 300 English sentences from top r/AskReddit submissions.

    Iterates over at most 100 top submissions, sentence-tokenizes each title
    and each comment body with NLTK, and stops after the first submission
    that brings the total to 300 or more sentences.

    Reddit API credentials are read from the REDDIT_CLIENT_ID and
    REDDIT_CLIENT_SECRET environment variables so that no secret is stored
    in the notebook.

    Returns:
        list[str]: At most 300 sentences.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
    """
    print("Scraping Reddit English posts...")
    client_id = os.environ.get("REDDIT_CLIENT_ID")
    client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
    if not client_id or not client_secret:
        raise RuntimeError(
            "Set the REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET environment "
            "variables before scraping Reddit."
        )
    reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='EnglishScraper', check_for_async=False)
    english = []
    subreddit = reddit.subreddit("AskReddit")
    for submission in subreddit.top(limit=100):
        english.extend(nltk.sent_tokenize(submission.title))
        # limit=0 drops the "load more comments" placeholders instead of fetching them.
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            if hasattr(comment, 'body'):
                english.extend(nltk.sent_tokenize(comment.body))
        if len(english) >= 300:
            break
    # Truncate before logging so the reported count matches what is returned.
    english = english[:300]
    print(f"Fetched {len(english)} English sentences from r/AskReddit")
    return english

def scrape_wikipedia_sentences(url, min_sentences=150, timeout=15):
    """Collect sentences from the <p> elements of a web page.

    Paragraphs are read in document order, bracketed numeric citation markers
    such as "[12]" are removed, and the text is sentence-tokenized with NLTK.
    Reading stops once ``min_sentences`` sentences have been gathered.

    Args:
        url: Page to download.
        min_sentences: Target number of sentences. Despite the name it acts as
            an upper bound on the result; fewer are returned when the page
            does not contain enough text.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        list[str]: At most ``min_sentences`` sentences; an empty list when the
        page cannot be fetched (the failure is printed, not raised).
    """
    print("Scraping English from Wikipedia...")
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Bound the wait and reject HTTP error responses instead of parsing them.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Could not fetch {url}: {exc}")
        return []
    soup = BeautifulSoup(response.content, 'html.parser')
    sentences = []
    for para in soup.find_all('p'):
        text = re.sub(r'\[\d+\]', '', para.get_text())
        sentences.extend(nltk.sent_tokenize(text))
        if len(sentences) >= min_sentences:
            break
    # Truncate before logging so the reported count matches what is returned.
    sentences = sentences[:min_sentences]
    print(f"Fetched {len(sentences)} English Wikipedia sentences")
    return sentences

In [None]:
# Data Collection & Preprocessing
def collect_data():
    """Scrape all sources and build the labelled sentence dataset.

    Every scraped text is split with split_into_sentences, each piece is
    normalised with preprocess_text, and pieces that end up empty are dropped.
    The rows are shuffled with a fixed seed, written to "dataset.csv" and
    returned.

    Returns:
        pandas.DataFrame: Columns 'sentence' (preprocessed text) and 'label'
        ('Greeklish' or 'English').
    """
    greeklish = scrape_reddit_greeklish() + scrape_insomnia() + scrape_youtube()
    english = scrape_reddit_english() + scrape_wikipedia_sentences("https://en.wikipedia.org/wiki/Natural_language_processing", 200)
    labelled_texts = [(text, 'Greeklish') for text in greeklish] + [(text, 'English') for text in english]

    print("Splitting and cleaning sentences...")
    new_rows = []
    for text, label in labelled_texts:
        for sentence in split_into_sentences(text):
            cleaned = preprocess_text(sentence)
            if cleaned:
                new_rows.append({'sentence': cleaned, 'label': label})

    # Explicit columns keep the frame well-formed even when nothing was scraped.
    df = pd.DataFrame(new_rows, columns=['sentence', 'label'])
    # Seeded shuffle so dataset.csv is reproducible for the same scraped input.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    df.to_csv("dataset.csv", index=False)
    print(f"Final dataset: {len(df)} rows")
    print(df['label'].value_counts())
    return df

In [None]:
# Model Training
def train_model(df):
    """Fit a TF-IDF + logistic-regression classifier and persist it.

    The frame's 'sentence' column is the input and 'label' the target. Data is
    split 80/20 (stratified by label, seed 42); metrics are printed for the
    held-out 20%. The fitted model and vectorizer are written to the 'model'
    directory.

    Args:
        df: DataFrame with 'sentence' and 'label' columns.

    Returns:
        tuple: (fitted LogisticRegression, fitted TfidfVectorizer).
    """
    sentences, labels = df['sentence'], df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        sentences, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Vocabulary is learned from the training split only, then reused for the test split.
    vectorizer = TfidfVectorizer(max_features=5000)
    train_features = vectorizer.fit_transform(X_train)
    test_features = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(train_features, y_train)
    predicted = model.predict(test_features)

    print("\nModel Performance:")
    print(f"Accuracy: {accuracy_score(y_test, predicted):.4f}")
    weighted_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    for metric_name, metric_fn in weighted_metrics:
        print(f"{metric_name}: {metric_fn(y_test, predicted, average='weighted'):.4f}")

    os.makedirs('model', exist_ok=True)
    artifacts = (
        (model, 'model/greeklish_classifier.pkl'),
        (vectorizer, 'model/tfidf_vectorizer.pkl'),
    )
    for artifact, path in artifacts:
        joblib.dump(artifact, path)
    print("\nModel and vectorizer saved to the 'model' directory.")
    return model, vectorizer

In [None]:
# Prediction Function
def predict_text(text):
    """Classify ``text`` using the classifier and vectorizer saved under 'model/'.

    Both artifacts are loaded from disk on every call, the text is passed
    through preprocess_text, vectorized, and the single predicted label is
    returned.
    """
    classifier = joblib.load("model/greeklish_classifier.pkl")
    tfidf = joblib.load("model/tfidf_vectorizer.pkl")
    features = tfidf.transform([preprocess_text(text)])
    return classifier.predict(features)[0]

In [None]:
# Main Script
def main():
    """Run the full pipeline: collect data, train the model, print two sample predictions."""
    print("Starting data collection and training pipeline...\n")
    dataset = collect_data()
    print("\nTraining model...")
    train_model(dataset)
    print("\nTesting prediction examples:")
    # The first sample is meant as Greeklish, the second as English.
    for sample in ("ti kaneis", "Hello, how are you?"):
        print(f"{sample} ->", predict_text(sample))
    print("\nDone.")

if __name__ == '__main__':
    main()

Starting data collection and training pipeline...

Scraping Reddit Greeklish posts...
Fetched 192 Greeklish posts from r/greece
Scraping Greeklish from Insomnia.gr...
Fetched 0 Greeklish posts from Insomnia.gr
Scraping Greeklish YouTube comments...
Fetched 0 Greeklish YouTube comments
Scraping Reddit English posts...
Fetched 918 English sentences from r/AskReddit
Scraping English from Wikipedia...
Fetched 38 English Wikipedia sentences
Splitting and cleaning sentences...
Final dataset: 885 rows
label
Greeklish    522
English      363
Name: count, dtype: int64

Training model...

Model Performance:
Accuracy: 0.9322
Precision: 0.9418
Recall: 0.9322
F1-Score: 0.9327

Model and vectorizer saved to the 'model' directory.

Testing prediction examples:
ti kaneis -> Greeklish
Hello, how are you? -> English

Done.
