In [24]:
import pandas as pd

# Load CSV (adjust delimiter if needed)
df = pd.read_csv("train.csv")

# Display first few rows
print(df.head())

   5 why doesn't an optical mouse work on a glass table?  \
0  6       What is the best off-road motorcycle trail ?    
1  3             What is Trans Fat? How to reduce that?    
2  7                         How many planes Fedex has?    
3  7  In the san francisco bay area, does it make se...    
4  5           What's the best way to clean a keyboard?    

                           or even on some surfaces?  \
0                  long-distance trail throughout CA   
1  I heard that tras fat is bad for the body.  Wh...   
2  I heard that it is the largest airline in the ...   
3  the prices of rent and the price of buying doe...   
4  I have very small stuff stuck under my keyboar...   

  Optical mice use an LED and a camera to rapidly capture images of the surface beneath the mouse.  The infomation from the camera is analyzed by a DSP (Digital Signal Processor) and used to detect imperfections in the underlying surface and determine motion. Some materials, such as glass, mirrors or 

In [25]:
# If the dataset has column names, replace with actual names
df.columns = ["Class", "Title", "Content", "Answer"]

# Combine title and content into a single text field
df["text"] = df["Title"].fillna('') + " " + df["Content"].fillna('') + " " + df["Answer"].fillna('')

# Keep only the relevant columns
df = df[["Class", "text"]]

# Convert class labels to zero-based indexing (optional)
df["Class"] = df["Class"] - 1  # Convert from 1-10 to 0-9

df

Unnamed: 0,Class,text
0,5,What is the best off-road motorcycle trail ? l...
1,2,What is Trans Fat? How to reduce that? I heard...
2,6,How many planes Fedex has? I heard that it is ...
3,6,"In the san francisco bay area, does it make se..."
4,4,What's the best way to clean a keyboard? I hav...
...,...,...
1399994,2,do all these ads on tv of yoko etc regarding h...
1399995,6,Ways to sell your video games? Like if you wan...
1399996,2,is it normal to have nots in your breast or bo...
1399997,0,Who can speak Hindi?? If you can write it here...


In [26]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from contractions import fix  # Install using: pip install contractions

# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("wordnet")

# Load stopwords
stop_words = set(stopwords.words("english"))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Ensure the text is not empty or just whitespace
    if not text or text.isspace():
        return ""  # Return an empty string if the input is invalid
    
    try:
        # Expand contractions (e.g., "don't" -> "do not")
        text = fix(text)
    except Exception as e:
        print(f"Error expanding contractions: {e} for text: {text}")
        return text  # Return original text if an error occurs
    
    # Lowercasing
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Tokenize, remove stopwords, and lemmatize
    text = " ".join(lemmatizer.lemmatize(word.strip()) for word in text.split() if word not in stop_words)
    
    return text

# Apply the function to the dataset
df["text"] = df["text"].apply(clean_text)

df.to_csv("preprocessed_train.csv", index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Error expanding contractions: string index out of range for text: Does anyone work for NATO İzmir/Turkey? Do u know the Human Resources Dept's info to send a resume?Thank you:) My name is Burcu,female,29.Single mom.I live in İzmir/Turkey.I used to live in U.S.A./Newyork, studied Business Administration at the CUNY and worked in an American Logistic company as a Foreign Trade and Freight Specialist at the same time. I also have 10 years of working experience about Export Business worked for several companies in Turkey as well.After my studies and also due to hard times in Newyork(about 9/11), I moved back to Turkey 2 years ago.I stayed in USA about 4 years. As my brother used to worked for NAVY/USA as a peddy officer, I knew the working conditions and benefits therefore I would like to work for NATO/İZMİR but I don't know how to apply for a job at NATO as a civilian. If anybody knows how to get info for Human Resources dept's postal addresss or tel/fax no's and contact person informatio

In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Load preprocessed dataset
df = pd.read_csv("preprocessed_train.csv")

# Ensure columns exist
if "text" not in df.columns or "Class" not in df.columns:
    raise ValueError("Columns 'text' and 'Class' not found in the CSV file.")

# Drop rows with NaN values in both 'text' and 'Class' columns
df = df.dropna(subset=["text", "Class"])

# Extract features and labels
X = df["text"]  # Text data
y = df["Class"]  # Target labels

# Initialize the vectorizer after data cleaning
vectorizer = TfidfVectorizer(max_features=100000)

# Proceed with TF-IDF vectorization
X = vectorizer.fit_transform(X)

# Save the TF-IDF vectorizer model
joblib.dump(vectorizer, "tfidf_vectorizer_train.pkl")

['tfidf_vectorizer_train.pkl']

In [31]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

# Load the pre-trained TF-IDF vectorizers for train data
vectorizer_train = joblib.load("tfidf_vectorizer_train.pkl")

# Load the preprocessed training dataset
df_train = pd.read_csv("preprocessed_train.csv")

# Ensure the 'text' and 'Class' columns exist
if "text" not in df_train.columns or "Class" not in df_train.columns:
    raise ValueError("Columns 'text' and 'Class' not found in the train CSV file.")

# Drop rows with NaN values in both 'text' and 'Class' columns
df_train = df_train.dropna(subset=["text", "Class"])

# Extract features and labels for training data
X_train = df_train["text"]  # Train data text
y_train = df_train["Class"]  # Train data labels

# Split the training data into a training set and validation set (e.g., 90% train, 10% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=None)

# Transform the training and validation data using the corresponding vectorizer
X_train_tfidf = vectorizer_train.transform(X_train)
X_val_tfidf = vectorizer_train.transform(X_val)

# Initialize the model (e.g., Naive Bayes)
model = MultinomialNB()

# Train the model using the training data (X_train_tfidf and y_train)
model.fit(X_train_tfidf, y_train)

# Make predictions on the validation data
y_val_pred = model.predict(X_val_tfidf)

# Evaluate the model on the validation data
print("Validation Results:")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))

Validation Results:
Accuracy: 0.7005900758668971
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.54      0.57     13964
           1       0.72      0.73      0.73     13950
           2       0.74      0.80      0.77     13927
           3       0.60      0.45      0.51     13975
           4       0.82      0.86      0.84     14143
           5       0.88      0.84      0.86     14096
           6       0.58      0.51      0.54     14237
           7       0.69      0.69      0.69     14073
           8       0.61      0.81      0.69     13928
           9       0.74      0.77      0.75     13689

    accuracy                           0.70    139982
   macro avg       0.70      0.70      0.70    139982
weighted avg       0.70      0.70      0.70    139982



In [33]:
from sklearn.linear_model import LogisticRegression

# Using Logistic Regression for text classification
model = LogisticRegression(max_iter=10000)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_val_tfidf)

# Evaluate the model
print("Model Evaluation Results:")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))

Model Evaluation Results:
Accuracy: 0.7005900758668971
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.54      0.57     13964
           1       0.72      0.73      0.73     13950
           2       0.74      0.80      0.77     13927
           3       0.60      0.45      0.51     13975
           4       0.82      0.86      0.84     14143
           5       0.88      0.84      0.86     14096
           6       0.58      0.51      0.54     14237
           7       0.69      0.69      0.69     14073
           8       0.61      0.81      0.69     13928
           9       0.74      0.77      0.75     13689

    accuracy                           0.70    139982
   macro avg       0.70      0.70      0.70    139982
weighted avg       0.70      0.70      0.70    139982

