# Imports

In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib


In [None]:

# Load dataset
df = pd.read_csv('./pullreqs.csv')

# Rows with an empty description are read by pandas as NaN (a float), which
# would make normalize_text fail on `.lower()`. Treat them as empty documents.
df['Description'] = df['Description'].fillna('').astype(str)

# Apply text normalization to 'Description' column.
# NOTE: normalize_text is defined in a later cell of this notebook; that cell
# must have been executed before this one.
df['Description'] = df['Description'].apply(normalize_text)

# Shuffle the dataset. The shuffle is seeded so that, combined with the seeded
# split below, the train/test partition is the same on every run (an unseeded
# shuffle here would make random_state on the split meaningless).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Preprocessing steps
# Split data into features (X) and target variable (y)
X = df['Description']
y = df['Class']

# Split data into training and testing sets.
# test_size is the fraction of rows placed in the *test* split: hold out 20%
# for evaluation and train on the remaining 80%. (The previous value of 0.8
# trained on only 20% of the data.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create pipeline with TF-IDF vectorizer and logistic regression classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Fit the model
pipeline.fit(X_train, y_train)

# Predictions on the held-out test split
y_pred = pipeline.predict(X_test)

# Evaluation: per-class precision / recall / F1 on the test split
print(classification_report(y_test, y_pred))

# Load validation dataset
df_validation = pd.read_csv('./internal.csv')

# Rows with an empty description are read by pandas as NaN (a float), which
# would make normalize_text fail on `.lower()`. Treat them as empty documents.
df_validation['Description'] = df_validation['Description'].fillna('').astype(str)

# Apply the same text normalization that was applied to the training data
df_validation['Description'] = df_validation['Description'].apply(normalize_text)

# Preprocessing steps for validation data.
# Labels are compared as strings; the cast is applied to BOTH the true labels
# and the predictions so classification_report never sees mixed label types.
X_val = df_validation['Description']
y_val = df_validation['Class'].astype(str)

# Make predictions on validation data using the pipeline fitted above
y_val_pred = pipeline.predict(X_val).astype(str)

# Evaluation on validation data
print(classification_report(y_val, y_val_pred))


In [4]:
# Text normalization function

# Resources shared by every normalize_text call, built on first use.
# stopwords.words() re-reads the corpus from disk each time it is called, so
# rebuilding the set per call is wasteful when the function is applied to
# every row of a DataFrame.
_STOP_WORDS = None
_STEMMER = None


def _normalization_resources():
    """Return ``(stop_words, stemmer)``, creating them on first use."""
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def normalize_text(text):
    """Normalize a piece of free text into a space-joined string of stems.

    Steps: lowercase, remove every character that is neither a word
    character nor whitespace, tokenize, drop English stopwords, and apply
    Porter stemming.

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents a missing
            cell) are treated as empty text; any other non-string value is
            converted with ``str()``.

    Returns:
        The normalized text, or ``''`` when nothing is left to normalize.
    """
    # Missing values: None, or float NaN (the only float that is != itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert text to lowercase
    text = str(text).lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Nothing but whitespace left: the result is empty, skip the NLP work.
    if not text.strip():
        return ''

    stop_words, stemmer = _normalization_resources()

    # Tokenize, remove stopwords, stem, and join the stems back into text
    tokens = word_tokenize(text)
    return ' '.join(
        stemmer.stem(word) for word in tokens if word not in stop_words
    )


In [None]:
# Persist the fitted pipeline to disk with joblib
# (change the filename below to wherever the model should be stored)
joblib.dump(value=pipeline, filename='my_model.pkl')


In [8]:
# Restore the pipeline that was previously written with joblib.dump.
# joblib.load unpickles the file, so only load model files you trust.
loaded_pipeline = joblib.load(filename='my_model.pkl')

# Raw text sample to classify
txt = "Issue number: N/A What is the current behavior? As a takeaway from our learning session about a menuController bug in Ionic Angular, the team would like to update our other providers to use the same architecture as the menuController to prevent this kind of issue from happening again in the future. We also noticed that the common provider does not provide much value and it's easier to just have two separate implementations in src and standalone. (There wasn't much code we could de-duplicate) What is the new behavior? Removed the common modal provider in favor of separate ones in src/standalone We already have test coverage for the modalController, so I did not add new ones Does this introduce a breaking change? Yes No Other information."

# Run the sample through normalize_text, then wrap it in a one-element list
# because predict() takes a collection of documents rather than a bare string
new_text = normalize_text(txt)
new_text_list = [new_text]

# predict() yields one label per input document; unpack the single label
(prediction,) = loaded_pipeline.predict(new_text_list)

# Show the predicted class
print(f"Predicted class for the new text: {prediction}")


Predicted class for the new text: Merged


In [None]:
# normalize_text("I am a student, and I am studying at the University of Jordan")