In [13]:
import zipfile
import os
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('all')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, classification_report, recall_score
import warnings
from pandas.errors import SettingWithCopyWarning
import joblib
import tensorflow as tf
import os
import torch
from transformers import BertTokenizer, BertModel
import umap
import hdbscan
from sklearn.linear_model import LogisticRegression
from bertopic import BERTopic

uploaded_zip = '/home/abarovic/popeREU/eclipse_platform.zip'
extract_dir = '/home/abarovic/popeREU'
if not os.path.exists(extract_dir):
    os.makedirs(extract_dir)
with zipfile.ZipFile(uploaded_zip, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

csv_file = os.path.join(extract_dir, 'eclipse_platform.csv')
fulldata = pd.read_csv(csv_file)

print(fulldata.columns)
print(fulldata.isna().sum())  # Check NaNs in training features

def CreateDuration(dataset):
    dataset['Created_time'] = pd.to_datetime(dataset['Created_time'], utc=True)
    dataset['Resolved_time'] = pd.to_datetime(dataset['Resolved_time'], utc=True)

    # Convert to the desired format
    dataset['Created_time2'] = dataset['Created_time'].dt.strftime('%m/%d/%Y %H:%M')
    dataset['Resolved_time2'] = dataset['Resolved_time'].dt.strftime('%m/%d/%Y %H:%M')

    # Parse the formatted datetime strings back to datetime objects to ensure they are in the correct format
    dataset['Created_time2'] = pd.to_datetime(dataset['Created_time2'], format='%m/%d/%Y %H:%M')
    dataset['Resolved_time2'] = pd.to_datetime(dataset['Resolved_time2'], format='%m/%d/%Y %H:%M')

    # Calculate the duration in hours
    dataset['Duration'] = dataset['Resolved_time2'] - dataset['Created_time2']
    dataset['Duration_hours'] = dataset['Duration'].dt.total_seconds() / 3600

    # Drop the intermediate columns
    dataset.drop(['Created_time2', 'Resolved_time2', 'Duration'], axis=1, inplace=True)

CreateDuration(fulldata)

def NoOutliers(df):
    Q1 = df['Duration_hours'].quantile(0.25)
    Q3 = df['Duration_hours'].quantile(0.75)

    # Calculate the IQR
    IQR = Q3 - Q1

    # Define the lower and upper bounds for filtering outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out the outliers
    df = df[(df['Duration_hours'] >= lower_bound) & (df['Duration_hours'] <= upper_bound)]
    return(df)

#fulldata = NoOutliers(fulldata)

def CreateTimeLabel(df):
    threshold = df['Duration_hours'].quantile(0.70)
    # Create a new column TimeLabel based on the threshold
    df['TimeLabel'] = df['Duration_hours'].apply(lambda x: 'short' if x <= threshold else 'long')
    df = df[df['Resolution'] != 'MOVED']
    #df = df[df['Resolution'] == 'FIXED']
    return df

fulldata = CreateTimeLabel(fulldata)
def RemoveStopWords(dataset):
    #making all the items in the descriptions columns lower
    dataset['Description'] = dataset['Description'].str.lower()
    # Define stop words
    stop_words = set(stopwords.words('english'))
     # Remove stop words
    for index, row in dataset.iterrows():
        # Check if the 'Description' is not NaN
        if isinstance(row['Description'], str):
            words = row['Description'].split()
            final_tokens = [word for word in words if word not in stop_words]
            dataset.at[index, 'Description'] = ' '.join(final_tokens)
    return dataset

fulldata = RemoveStopWords(fulldata)

def Lemmitization(dataset):
    # Initialize the WordNet Lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Function to lemmatize a sentence
    def lemmatize_sentence(sentence):
        words = nltk.word_tokenize(sentence)
        lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]  # Lemmatize verbs (default)
        return ' '.join(lemmatized_words)
    dataset['Description'] = dataset['Description'].astype(str)  # Convert all to strings
    # Apply lemmatization to each row in the 'Description' column
    dataset['Description'] = dataset['Description'].apply(lemmatize_sentence)

Lemmitization(fulldata)

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return None

# Function to calculate sentiment scores
def CalculateSentimentScores(description):
    tokens = word_tokenize(description)
    tagged_tokens = pos_tag(tokens)
        
    pos_score = 0
    neg_score = 0
    token_count = 0
    for word, tag in tagged_tokens:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
            continue
        
        synsets = list(swn.senti_synsets(word, wn_tag))
        if not synsets:
            continue
        
        # Use the first synset for simplicity
        synset = synsets[0]
        pos_score += synset.pos_score()
        neg_score += synset.neg_score()
        token_count += 1
    
    # Normalize scores by the number of tokens
    if token_count > 0:
        pos_score /= token_count
        neg_score /= token_count
        return pos_score, neg_score

# Apply sentiment score calculation to each description
def CreatePosNegColumns(dataset): 
    dataset[['Pos_Score', 'Neg_Score']] = dataset['Description'].apply(lambda x: pd.Series(CalculateSentimentScores(x)))

CreatePosNegColumns(fulldata)

def EmotionColumn(dataset):
    #Creating a column for binary emotion (positive or negative)
    dataset['Emotion'] = dataset['Pos_Score'] - dataset['Neg_Score']
    
    # Assign labels based on the difference
    dataset['Emotion'] = dataset['Emotion'].apply(lambda x: 'positive' if x > 0 else 'negative')

EmotionColumn(fulldata)

def EmotionalityColumn(dataset):
    dataset['Emotionality'] = dataset['Pos_Score']+ dataset['Neg_Score']

EmotionalityColumn(fulldata)

def Destiny(df):
    # Create a new column 'Destiny' with default value 'Not Fixed'
    df['Destiny'] = 'Not Fixed'
    # Update 'Destiny' based on conditions in 'Resolution' column
    df.loc[df['Resolution'] == 'FIXED', 'Destiny'] = 'Fixed'
    return df

fulldata = Destiny(fulldata)
#print(fulldata['Emotion'].head(100))

def Emotion(df):
    # Fill NaN values with a default value (e.g., 'neutral') before mapping
    df['Emotion'] = df['Emotion'].fillna('neutral')
    # Label Positive emotion as 1, negative as 0, and neutral as -1
    df['Emotion'] = df['Emotion'].map({'positive': 1, 'negative': 0, 'neutral': -1})
    return df

fulldata = Emotion(fulldata)


def Priority(df):
    # Fill NaN values with a default value (e.g., 'P5') before mapping
    df['Priority'] = df['Priority'].fillna('P5')
    # Label Priority Columns numerically
    df['Priority'] = df['Priority'].map({'P1': 1, 'P2': 2, 'P3': 3, 'P4': 4, 'P5': 5})
    return df

fulldata = Priority(fulldata)

def DropDescriptionNA(df):
    df = df[df['Description'] != 'nan']

DropDescriptionNA(fulldata)

def Model(df):
    model = BERTopic(nr_topics=20)

    # Prepare the documents
    docs = df['Description'].fillna('').astype(str).tolist()

    # Fit the model to your data
    topics, probabilities = model.fit_transform(docs)
    #adding a column for predicted topics
    new_docs = df['Description'].fillna('').astype(str).to_list()
    topics, probs = model.transform(new_docs)
    df['Predicted_Topic'] = topics

    # Get topic information
    topic_info = model.get_topic_info()
    print(topic_info)
    # Get words associated with each topic
    topic_words = {topic: model.get_topic(topic) for topic in topic_info.Topic}
    print(topic_words)
    return df

fulldata = Model(fulldata)

print(fulldata.columns)

print(fulldata.columns)
print(fulldata['Emotionality'].head(100))

def SplitData(dataset):
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    trainset, testset = dataset[:train_size], dataset[train_size:]
    return trainset, testset
trainset, testset = SplitData(fulldata)
print("Total items in dataset:", fulldata.shape[0])
print("Total items in trainset:", trainset.shape[0])
print("Total items in testset:", testset.shape[0])

print(trainset.columns)
print(testset.columns)

print(trainset['Emotionality'].head(100))

print("-----------------\n Training Set: \n-----------------")
print(trainset.isna().sum())  # Check NaNs in training features
print("-----------------\n Test Set: \n-----------------")
print(testset.isna().sum())   # Check NaNs in test features


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/abarovic/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/abarovic/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/abarovic/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/abarovic/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/abarovic/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]  

Index(['Issue_id', 'Priority', 'Component', 'Duplicated_issue', 'Title',
       'Description', 'Status', 'Resolution', 'Version', 'Created_time',
       'Resolved_time'],
      dtype='object')
Issue_id                0
Priority                0
Component               0
Duplicated_issue    70752
Title                   0
Description           129
Status                  0
Resolution              0
Version                 0
Created_time            0
Resolved_time           0
dtype: int64


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topic  Count                                               Name  \
0      -1  42552                           -1_file_eclipse_use_line   
1       0  32502                              0_at_00000000_use_new   
2       1   4361                          1_cvs_project_file_folder   
3       2   1535                          2_search_undo_history_key   
4       3    901                              3_bug_clone_build_see   
5       4    791                         4_bind_object_return_value   
6       5    735                              5_note_w00t_dsgsg_den   
7       6    564                        6_console_print_output_0303   
8       7    378  7_gtk_fedorapeopleorggitweb_acommit_libclientd...   
9       8    231                     8_sort_order_column_comparator   
10      9    145                      9_string_public_int_partition   
11     10    130                                 10_nan_tycho_work_   
12     11     88                                  11_lt_html_tag_gt   
13    

CNN (Emotion, Emotionality, priority)

In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

trainset, testset = SplitData(fulldata)
# Define features and target
features = ['Emotion', 'Emotionality', 'Priority']
target = 'Resolution'

# Encode target labels into numerical format
label_encoder = LabelEncoder()
trainset[target] = label_encoder.fit_transform(trainset[target])
testset[target] = label_encoder.transform(testset[target])

# Handle missing values (Fill NaN with median value)
imputer = SimpleImputer(strategy='median')  # You can also use 'mean' or 'most_frequent'

# Apply imputer to features
X_train = imputer.fit_transform(trainset[features])
X_test = imputer.transform(testset[features])

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Target variable
y_train = trainset[target].dropna()  # Ensure no NaNs in target
y_test = testset[target].dropna()

# Define CNN Model
model = Sequential([
    Conv1D(filters=32, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification (short vs long)
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train model
history = model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_test, y_test))

# Evaluate model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 0.5745992601726264
              precision    recall  f1-score   support

   DUPLICATE       0.00      0.00      0.00      2406
       FIXED       0.57      1.00      0.73      9786
     INVALID       0.00      0.00      0.00      1069
  NDUPLICATE       0.00      0.00      0.00       966
 NOT_ECLIPSE       0.00      0.00      0.00       633
     WONTFIX       0.00      0.00      0.00       933
  WORKSFORME       0.00      0.00      0.00      1238

    accuracy                           0.57     17031
   macro avg       0.08      0.14      0.10     17031
weighted avg       0.33      0.57      0.42     17031



CNN (Emotion, Emotionality, priority, BERT)

In [15]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

trainset, testset = SplitData(fulldata)
# Define features and target
features = ['Emotion', 'Emotionality', 'Priority', 'Predicted_Topic']
target = 'Resolution'

# Encode target labels into numerical format
label_encoder = LabelEncoder()
trainset[target] = label_encoder.fit_transform(trainset[target])
testset[target] = label_encoder.transform(testset[target])

# Handle missing values (Fill NaN with median value)
imputer = SimpleImputer(strategy='median')  # You can also use 'mean' or 'most_frequent'

# Apply imputer to features
X_train = imputer.fit_transform(trainset[features])
X_test = imputer.transform(testset[features])

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Target variable
y_train = trainset[target].dropna()  # Ensure no NaNs in target
y_test = testset[target].dropna()

# Define CNN Model
model = Sequential([
    Conv1D(filters=32, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification (short vs long)
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train model
history = model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_test, y_test))

# Evaluate model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Epoch 1/20

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 0.5745992601726264
              precision    recall  f1-score   support

   DUPLICATE       0.00      0.00      0.00      2406
       FIXED       0.57      1.00      0.73      9786
     INVALID       0.00      0.00      0.00      1069
  NDUPLICATE       0.00      0.00      0.00       966
 NOT_ECLIPSE       0.00      0.00      0.00       633
     WONTFIX       0.00      0.00      0.00       933
  WORKSFORME       0.00      0.00      0.00      1238

    accuracy                           0.57     17031
   macro avg       0.08      0.14      0.10     17031
weighted avg       0.33      0.57      0.42     17031



CNN (Emotion, emotionality, priority) -> Weighted

In [16]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from collections import Counter

# Assuming 'SplitData' and 'fulldata' are predefined and correctly implemented
trainset, testset = SplitData(fulldata)

# Define features and target
features = ['Emotion', 'Emotionality', 'Priority']
target = 'Resolution'

# Encode target labels into numerical format
label_encoder = LabelEncoder()
trainset[target] = label_encoder.fit_transform(trainset[target])
testset[target] = label_encoder.transform(testset[target])

# Handle missing values (Fill NaN with median value)
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(trainset[features])
X_test = imputer.transform(testset[features])

# Ensure target variable has no NaNs
y_train = trainset[target].dropna().astype(int)
y_test = testset[target].dropna().astype(int)

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
print("Resampled class distribution:", Counter(y_train))

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define CNN Model
model = Sequential([
    Conv1D(filters=32, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_test, y_test))

# Model evaluation
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Resampled class distribution: Counter({1: 32628, 5: 32628, 2: 32628, 0: 32628, 6: 32628, 3: 32628, 4: 32628})
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 0.5745992601726264
              precision    recall  f1-score   support

   DUPLICATE       0.00      0.00      0.00      2406
       FIXED       0.57      1.00      0.73      9786
     INVALID       0.00      0.00      0.00      1069
  NDUPLICATE       0.00      0.00      0.00       966
 NOT_ECLIPSE       0.00      0.00      0.00       633
     WONTFIX       0.00      0.00      0.00       933
  WORKSFORME       0.00      0.00      0.00      1238

    accuracy                           0.57     17031
   macro avg       0.08      0.14      0.10     17031
weighted avg       0.33      0.57      0.42     17031



CNN (Emotion, emotionality, priority, BERT) -> Weighted

In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from collections import Counter

trainset, testset = SplitData(fulldata)
# Define features and target
features = ['Emotion', 'Emotionality', 'Priority', 'Predicted_Topic']
target = 'Resolution'

# Encode target labels into numerical format
label_encoder = LabelEncoder()
trainset[target] = label_encoder.fit_transform(trainset[target])
testset[target] = label_encoder.transform(testset[target])

# Handle missing values (Fill NaN with median value)
imputer = SimpleImputer(strategy='median')  # You can also use 'mean' or 'most_frequent'

# Apply imputer to features
X_train = imputer.fit_transform(trainset[features])
X_test = imputer.transform(testset[features])

# Target variable
y_train = trainset[target].dropna()  # Ensure no NaNs in target
y_test = testset[target].dropna()

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
print("Resampled class distribution:", Counter(y_train))

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Define CNN Model
model = Sequential([
    Conv1D(filters=32, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification (short vs long)
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train model
history = model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_test, y_test))

# Evaluate model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Resampled class distribution: Counter({1: 32628, 5: 32628, 2: 32628, 0: 32628, 6: 32628, 3: 32628, 4: 32628})
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 0.5745992601726264
              precision    recall  f1-score   support

   DUPLICATE       0.00      0.00      0.00      2406
       FIXED       0.57      1.00      0.73      9786
     INVALID       0.00      0.00      0.00      1069
  NDUPLICATE       0.00      0.00      0.00       966
 NOT_ECLIPSE       0.00      0.00      0.00       633
     WONTFIX       0.00      0.00      0.00       933
  WORKSFORME       0.00      0.00      0.00      1238

    accuracy                           0.57     17031
   macro avg       0.08      0.14      0.10     17031
weighted avg       0.33      0.57      0.42     17031

