In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [67]:
# Load your questions dataset and tags dataset from CSV files
try:
    questions_data = pd.read_csv(r'C:\Rajveer\AI\AI Project\AI Project\Data\Questions.csv', encoding='utf-8',nrows=500000)
except UnicodeDecodeError:
    questions_data = pd.read_csv(r'C:\Rajveer\AI\AI Project\AI Project\Data\Questions.csv', encoding='latin-1',nrows=500000)

try:
    tags_data = pd.read_csv(r'C:\Rajveer\AI\AI Project\AI Project\Data\Tags.csv', encoding='utf-8',nrows=500000)
except UnicodeDecodeError:
    tags_data = pd.read_csv(r'C:\Rajveer\AI\AI Project\AI Project\Data\Tags.csv', encoding='latin-1',nrows=500000)


In [68]:
questions_data

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
...,...,...,...,...,...,...,...
499995,17419260,1406975.0,2013-07-02T06:55:20Z,,0,Objective C JSON object null but responseData ...,<p>I am sending an HTTP request to a web servi...
499996,17419270,2357697.0,2013-07-02T06:55:49Z,,0,Adding the .attr() to a variable - Jquery,<p>I am using Rickshaw framework which uses D3...
499997,17419290,2237469.0,2013-07-02T06:56:32Z,,2,Class cannot be resolved in jsp page,<p>I have a jsp page and a class. I am trying ...
499998,17419330,1239406.0,2013-07-02T06:58:43Z,,1,Replacing newlines in XML attributes with XSLT,<p>I need some XSLT (or <em>something</em> - s...


In [69]:
tags_data

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn
...,...,...
499995,6737260,design
499996,6737270,c#
499997,6737270,fuzzy-comparison
499998,6737310,c#


In [70]:
 
# Assuming there's a common attribute like 'question_id' in both datasets to merge on
merged_data = pd.merge(questions_data, tags_data, on='Id', how='inner')
merged_data

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,Tag
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex
1,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,actionscript-3
2,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,air
3,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn
4,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,tortoisesvn
...,...,...,...,...,...,...,...,...
499995,6737260,788580.0,2011-07-18T18:04:08Z,,2,Creating a thread for a database in Qt: a reas...,<p>the application I'm trying to design with Q...,design
499996,6737270,149358.0,2011-07-18T18:04:41Z,,2,Fuzzy Text Matching,<p>I need to attempt to match a given text to ...,c#
499997,6737270,149358.0,2011-07-18T18:04:41Z,,2,Fuzzy Text Matching,<p>I need to attempt to match a given text to ...,fuzzy-comparison
499998,6737310,277671.0,2011-07-18T18:07:44Z,,0,How to alert user of alarm finishing in Window...,<p>I have a timer app in the WP7 marketplace a...,c#


In [71]:
# Data preprocessing
merged_data['Body'] = merged_data['Body'].str.lower()  # Convert text to lowercase
merged_data

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,Tag
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>i've written a database generation script i...,flex
1,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>i've written a database generation script i...,actionscript-3
2,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>i've written a database generation script i...,air
3,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>are there any really good tutorials explain...,svn
4,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>are there any really good tutorials explain...,tortoisesvn
...,...,...,...,...,...,...,...,...
499995,6737260,788580.0,2011-07-18T18:04:08Z,,2,Creating a thread for a database in Qt: a reas...,<p>the application i'm trying to design with q...,design
499996,6737270,149358.0,2011-07-18T18:04:41Z,,2,Fuzzy Text Matching,<p>i need to attempt to match a given text to ...,c#
499997,6737270,149358.0,2011-07-18T18:04:41Z,,2,Fuzzy Text Matching,<p>i need to attempt to match a given text to ...,fuzzy-comparison
499998,6737310,277671.0,2011-07-18T18:07:44Z,,0,How to alert user of alarm finishing in Window...,<p>i have a timer app in the wp7 marketplace a...,c#


In [72]:
merged_data.dropna(inplace=True)

In [73]:
# Splitting data into train and test sets
X = merged_data['Body']
y = merged_data['Tag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [74]:

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [75]:

# Multi-label classification using Logistic Regression
model = LogisticRegression(solver='liblinear')
model.fit(X_train_tfidf, y_train)


                           precision    recall  f1-score   support

                     .net       0.04      0.06      0.05        54
                 .net-2.0       0.00      0.00      0.00         1
                 .net-3.5       0.00      0.00      0.00         2
                 .net-4.0       0.00      0.00      0.00         2
                       3d       0.00      0.00      0.00         1
                     abap       0.00      0.00      0.00         1
                     abbr       0.00      0.00      0.00         1
                   abcpdf       0.00      0.00      0.00         1
           access-control       0.00      0.00      0.00         1
            accessibility       0.00      0.00      0.00         1
                  acronym       0.00      0.00      0.00         1
             actionscript       0.00      0.00      0.00         2
           actionscript-3       0.00      0.00      0.00         1
                  activex       0.00      0.00      0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [86]:

# Example usage: Predict tags for a new question
def predict_tags(new_question):
    new_question_tfidf = tfidf_vectorizer.transform([new_question])
    predicted_tags = model.predict(new_question_tfidf)
    return predicted_tags

# Example usage:
new_question = "I am using an Enterprise version of Excel, so I cannot join the Beta to get the latest Excel feature with Python. I do have access to the Excel Labs Add In with the initial version of Python for Excel. When I try to use it, it won't recognize the function (PY) though, as it always results in a #NAME? error. What am I doing wrong? The IDE looks like everything works until I go to run it."
predicted_tags = predict_tags(new_question)
print(f"Predicted Tags: {predicted_tags}")


Predicted Tags: ['python']
