In [26]:
import pandas as pd

# Load CSV (adjust delimiter if needed)
df = pd.read_csv("test.csv")

# Display first few rows
print(df.head())

   9                       What makes friendship click?  \
0  2                      Why does Zebras have stripes?   
1  4           What did the itsy bitsy sipder climb up?   
2  4  What is the difference between a Bachelors and...   
3  3                              Why do women get PMS?   
4  3  If your co-worker is guilty of unsanitary hygi...   

                      How does the spark keep going?  \
0  What is the purpose or those stripes? Who do t...   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4                                                NaN   

  good communication is what does it.  Can you move beyond small talk and say what's really on your mind.  If you start doing this, my expereince is that potentially good friends will respond or shun you.  Then you know who the really good friends are.  
0  this provides camouflage - predator vision is ... 

In [27]:
# If the dataset has column names, replace with actual names
df.columns = ["Class", "Title", "Content", "Answer"]

# Combine title and content into a single text field
df["text"] = df["Title"].fillna('') + " " + df["Content"].fillna('') + " " + df["Answer"].fillna('')

# Keep only the relevant columns
df = df[["Class", "text"]]

# Convert class labels to zero-based indexing (optional)
df["Class"] = df["Class"] - 1  # Convert from 1-10 to 0-9

In [28]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from contractions import fix  # Install using: pip install contractions

# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("wordnet")

# Load stopwords
stop_words = set(stopwords.words("english"))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Ensure the text is not empty or just whitespace
    if not text or text.isspace():
        return ""  # Return an empty string if the input is invalid
    
    try:
        # Expand contractions (e.g., "don't" -> "do not")
        text = fix(text)
    except Exception as e:
        print(f"Error expanding contractions: {e} for text: {text}")
        return text  # Return original text if an error occurs
    
    # Lowercasing
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Tokenize, remove stopwords, and lemmatize
    text = " ".join(lemmatizer.lemmatize(word.strip()) for word in text.split() if word not in stop_words)
    
    return text

# Apply the function to the dataset
df["text"] = df["text"].apply(clean_text)

df.to_csv("preprocessed_test.csv", index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Load preprocessed dataset
df = pd.read_csv("preprocessed_test.csv")

# Ensure columns exist
if "text" not in df.columns or "Class" not in df.columns:
    raise ValueError("Columns 'text' and 'Class' not found in the CSV file.")

# Drop rows with NaN values in both 'text' and 'Class' columns
df = df.dropna(subset=["text", "Class"])

# Extract features and labels
X = df["text"]  # Text data
y = df["Class"]  # Target labels

# Initialize the vectorizer after data cleaning
vectorizer = TfidfVectorizer(max_features=100000)

# Proceed with TF-IDF vectorization
X = vectorizer.fit_transform(X)

# Save the TF-IDF vectorizer model
joblib.dump(vectorizer, "tfidf_vectorizer_test.pkl")

['tfidf_vectorizer_test.pkl']