In [5]:
import pandas as pd

# Load CSV (adjust delimiter if needed)
df = pd.read_csv("train.csv")

# Display first few rows
print(df.head())

   5 why doesn't an optical mouse work on a glass table?  \
0  6       What is the best off-road motorcycle trail ?    
1  3             What is Trans Fat? How to reduce that?    
2  7                         How many planes Fedex has?    
3  7  In the san francisco bay area, does it make se...    
4  5           What's the best way to clean a keyboard?    

                           or even on some surfaces?  \
0                  long-distance trail throughout CA   
1  I heard that tras fat is bad for the body.  Wh...   
2  I heard that it is the largest airline in the ...   
3  the prices of rent and the price of buying doe...   
4  I have very small stuff stuck under my keyboar...   

  Optical mice use an LED and a camera to rapidly capture images of the surface beneath the mouse.  The infomation from the camera is analyzed by a DSP (Digital Signal Processor) and used to detect imperfections in the underlying surface and determine motion. Some materials, such as glass, mirrors or 

In [6]:
# If the dataset has column names, replace with actual names
df.columns = ["Class", "Title", "Content", "Answer"]

# Combine title and content into a single text field
df["text"] = df["Title"].fillna('') + " " + df["Content"].fillna('')

# Keep only the relevant columns
df = df[["Class", "text"]]

# Convert class labels to zero-based indexing (optional)
df["Class"] = df["Class"] - 1  # Convert from 1-10 to 0-9

In [7]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from contractions import fix  # Install using: pip install contractions

# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("wordnet")

# Load stopwords
stop_words = set(stopwords.words("english"))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Expand contractions (e.g., "don't" -> "do not")
    text = fix(text)
    
    # Lowercasing
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Tokenize, remove stopwords, and lemmatize
    text = " ".join(lemmatizer.lemmatize(word.strip()) for word in text.split() if word not in stop_words)
    
    return text

# Apply the function to the dataset
df["text"] = df["text"].apply(clean_text)

df.to_csv("preprocessed_train.csv", index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Load preprocessed dataset
df = pd.read_csv("preprocessed_train.csv")

# Ensure columns exist
if "text" not in df.columns or "Class" not in df.columns:
    raise ValueError("Columns 'text' and 'Class' not found in the CSV file.")

# Drop rows with NaN values in both 'text' and 'Class' columns
df = df.dropna(subset=["text", "Class"])

# Extract features and labels
X = df["text"]  # Text data
y = df["Class"]  # Target labels

# Initialize the vectorizer after data cleaning
vectorizer = TfidfVectorizer(max_features=50000)

# Proceed with TF-IDF vectorization
X = vectorizer.fit_transform(X)

# Save the TF-IDF vectorizer model
joblib.dump(vectorizer, "tfidf_vectorizer_train.pkl")

['tfidf_vectorizer_train.pkl']