In [57]:
import pandas as pd

In [58]:
import pandas as pd

# Parse train.csv by hand: the first line supplies the column names, and each
# remaining non-blank line is split once at its final comma, so any commas
# earlier in the line stay inside the first field.
with open('train.csv', 'r', encoding='utf-8') as csv_file:
    header = csv_file.readline().strip().split(',')
    stripped_lines = (raw_line.strip() for raw_line in csv_file)
    rows = [text.rsplit(',', 1) for text in stripped_lines if text]

# Assemble the parsed rows into a DataFrame
df = pd.DataFrame(rows, columns=header)

print(df.head())


                                               query           label
0  How do I implement a binary search tree in Pyt...    Code example
1  What's the difference between == and === in Ja...     Explanation
2  My code keeps throwing a NullPointerException,...      Error help
3    Can you explain Big O notation in simple terms?     Explanation
4  What are some best practices for writing clean...  Best practices


In [59]:
# Remove every double-quote character from string cells; non-string cells are
# returned unchanged.  DataFrame.applymap is deprecated since pandas 2.1 (it
# emits a FutureWarning), so use its replacement DataFrame.map when this
# pandas version provides it and fall back to applymap otherwise.
_apply_elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = _apply_elementwise(lambda x: x.replace('"', '') if isinstance(x, str) else x)


  df = df.applymap(lambda x: x.replace('"', '') if isinstance(x, str) else x)


In [60]:
# Display the DataFrame as this cell's output (bare last expression).
df

Unnamed: 0,query,label
0,How do I implement a binary search tree in Pyt...,Code example
1,What's the difference between == and === in Ja...,Explanation
2,"My code keeps throwing a NullPointerException,...",Error help
3,Can you explain Big O notation in simple terms?,Explanation
4,What are some best practices for writing clean...,Best practices
...,...,...
758,Implementation of segment tree with lazy propa...,Code example
759,Null pointer in my graph implementation,Error help
760,Most efficient way to implement string operati...,Best practices
761,When to use different tree structures?,General Q&A


In [61]:
# Convert the text in the 'query' column to lower case
lowercased_queries = df['query'].str.lower()
df['query'] = lowercased_queries


In [81]:
# Write the frame to a CSV file one directory up.  With to_csv's defaults the
# row index is written as well, as an unnamed first column.
output_path = '../dataset_for_classification.csv'
df.to_csv(output_path)

In [62]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['query'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training queries only, then reuse the
# fitted vectorizer to encode the held-out queries.
vectorizer = TfidfVectorizer()
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [64]:
from sklearn.linear_model import LogisticRegression

# Create a logistic-regression classifier with default constructor arguments
# and fit it on the TF-IDF training matrix and the training labels.
model = LogisticRegression()
model.fit(X_train_vec, y_train)


In [65]:
from sklearn.metrics import classification_report

# Predict labels for the held-out queries and print the per-class metrics
y_pred = model.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)


                precision    recall  f1-score   support

Best practices       0.84      1.00      0.91        16
  Code example       1.00      0.91      0.95        34
    Error help       1.00      0.85      0.92        26
   Explanation       0.89      0.89      0.89        28
      Feedback       0.89      0.96      0.92        25
   General Q&A       0.77      0.83      0.80        24

      accuracy                           0.90       153
     macro avg       0.90      0.91      0.90       153
  weighted avg       0.91      0.90      0.90       153



In [66]:
# Sample question to classify
query = "How do I fix a NullPointerException?"

# transform() takes a collection of documents, so wrap the single string in a list
single_query_batch = [query]
query_vec = vectorizer.transform(single_query_batch)


In [67]:
# predict() returns one label per input row; take the label for the only row
predictions = model.predict(query_vec)
predicted_label = predictions[0]
print(f"Predicted intent: {predicted_label}")


Predicted intent: Best practices


In [68]:
# predict_proba() returns one row of class probabilities per input row
probs = model.predict_proba(query_vec)
class_probabilities = probs[0]
confidence = max(class_probabilities)  # largest class probability for the query
print(f"Confidence: {confidence:.2f}")


Confidence: 0.37


In [69]:
# Display the DataFrame as this cell's output (bare last expression).
df

Unnamed: 0,query,label
0,how do i implement a binary search tree in pyt...,Code example
1,what's the difference between == and === in ja...,Explanation
2,"my code keeps throwing a nullpointerexception,...",Error help
3,can you explain big o notation in simple terms?,Explanation
4,what are some best practices for writing clean...,Best practices
...,...,...
758,implementation of segment tree with lazy propa...,Code example
759,null pointer in my graph implementation,Error help
760,most efficient way to implement string operati...,Best practices
761,when to use different tree structures?,General Q&A


In [70]:
# Show only the rows whose label equals 'Best practices'
is_best_practices = df['label'].eq('Best practices')
df[is_best_practices]


Unnamed: 0,query,label
4,what are some best practices for writing clean...,Best practices
6,how do i handle api rate limiting properly?,Best practices
11,how do i optimize database queries?,Best practices
17,how do i properly structure a react project?,Best practices
20,what's the best way to handle errors in python?,Best practices
...,...,...
736,most efficient way to implement string matching?,Best practices
742,best practices for implementing graph traversal?,Best practices
748,most efficient way to implement priority queue?,Best practices
754,best approach for implementing hash tables?,Best practices


In [79]:
# List the distinct values found in the 'label' column
label_column = df['label']
label_column.unique()

array(['Code example', 'Explanation', 'Error help', 'Best practices',
       'Feedback', 'General Q&A'], dtype=object)

In [78]:
# Requires the fitted `vectorizer` and `model` from the cells above.
#
# The transform -> predict -> predict_proba sequence is wrapped in a helper so
# that further queries can be scored by calling it with a different string
# instead of copy-pasting the three steps again.


def predict_intent(text, fitted_vectorizer, fitted_model):
    """Classify a single query string.

    Parameters
    ----------
    text : str
        The query to classify.
    fitted_vectorizer
        Object whose ``transform`` accepts a list of documents and returns
        their feature matrix.
    fitted_model
        Classifier exposing ``predict`` and ``predict_proba`` for that
        feature matrix.

    Returns
    -------
    tuple
        ``(features, label, probabilities)``: the feature matrix for the
        one-element batch, the predicted label of its only row, and the
        full ``predict_proba`` output for the batch.
    """
    # transform() works on a collection of documents, hence the one-item list
    features = fitted_vectorizer.transform([text])
    label = fitted_model.predict(features)[0]
    probabilities = fitted_model.predict_proba(features)
    return features, label, probabilities


# New user input
query = "can you explain big o notation in simple terms?"

# Vectorise and predict; the module-level names match the earlier cells
query_vec, predicted_label, probs = predict_intent(query, vectorizer, model)
print(f"Predicted intent: {predicted_label}")

# Confidence is the largest class probability in the query's row
confidence = max(probs[0])
print(f"Confidence: {confidence:.2f}")


Predicted intent: Explanation
Confidence: 0.52
