In [12]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

df = pd.read_csv('sentiAnalysis.csv')  # Replace with the actual path to your training set
df['Emotion'] = df['Emotion'].map({'positive': 1, 'negative': 0})

# Map the Priority column
df['Priority'] = df['Priority'].map({'P1': 5, 'P2': 4, 'P3': 3, 'P4': 2, 'P5': 1})

In [11]:
df

Unnamed: 0,Issue_id,Priority,Component,Duplicated_issue,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Duration_hours,TimeLabel,Pos_Score,Neg_Score,Emotion,Emotionality
0,1,3,Team,,Usability issue with external editors (1GE6IRL),- setup project contain * .gif resource ; - re...,CLOSED,FIXED,2.0,2001-10-11 01:34:00+00:00,2012-02-09 20:57:47+00:00,90571.383333,long,0.051768,0.065657,0,0.117424
1,2,5,Team,,Opening repository resources doesnt honor type...,open repository resource always open default t...,RESOLVED,FIXED,2.0,2001-10-11 01:34:00+00:00,2002-05-07 14:33:56+00:00,5004.983333,short,0.020833,0.056818,0,0.077652
2,3,5,Team,,Sync does not indicate deletion (1GIEN83),km ( 10/2/2001 5:55:18 pm ) ; pr deletion indi...,RESOLVED,FIXED,2.0,2001-10-11 01:34:00+00:00,2010-05-07 14:28:53+00:00,75132.900000,long,0.086364,0.036364,1,0.122727
3,4,5,Team,,need better error message if catching up over ...,- become synchronize project repository ; - us...,RESOLVED,FIXED,2.0,2001-10-11 01:34:00+00:00,2002-03-01 21:27:31+00:00,3403.883333,short,0.049342,0.092105,0,0.141447
4,5,3,Team,,ISharingManager sharing API inconsistent (1GAU...,getting/setting manage state resource ; method...,RESOLVED,WONTFIX,2.0,2001-10-11 01:34:00+00:00,2008-08-15 12:04:36+00:00,60010.500000,long,0.085227,0.005682,1,0.090909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68119,229777,3,UI,,[Viewers] Wrong argument in the first statemen...,build id : 3.3 ( i sure mean build id ; 3.3 ec...,RESOLVED,FIXED,3.3,2008-05-01 13:47:00+00:00,2008-05-10 14:06:07+00:00,216.316667,short,0.064024,0.027439,1,0.091463
68120,229779,3,UI,,NPE in performance tests,several npes within ui session test prevent ru...,VERIFIED,FIXED,3.4,2008-05-01 13:52:00+00:00,2008-05-20 14:12:14+00:00,456.333333,short,0.033333,0.033333,0,0.066667
68121,229782,3,UI,,Performance tests for ICU Collator,i20080501-0100 ; ; use collator ( see dependan...,VERIFIED,FIXED,3.4,2008-05-01 14:05:00+00:00,2009-06-01 18:25:12+00:00,9508.333333,long,0.025000,0.000000,1,0.025000
68122,229789,3,UI,,[Examples] examples plugins create duplicate m...,create attachment 98318 ; screenshot ; ; i2008...,VERIFIED,FIXED,3.4,2008-05-01 15:02:00+00:00,2008-05-31 01:57:57+00:00,706.916667,short,0.092105,0.013158,1,0.105263


In [12]:
def remove_outliers(df, column):
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    
    # Calculate IQR (Interquartile Range)
    IQR = Q3 - Q1
    
    # Define the lower and upper bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Filter out the outliers
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    
    return df
df = remove_outliers(df, 'Duration_hours')

In [20]:
def SVMregressionModel(df, feature_cols, target_col, kernel='rbf', test_size=0.2, random_state=42):
    # Fill missing values in feature columns
    df = df.copy()
    X = df[feature_cols]
    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the SVM regression model with the specified kernel
    model = SVR(kernel=kernel)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')
    
    return mse, r2 

In [21]:
mse_rbf, r2_rbf = SVMregressionModel(df, ['Priority', 'Emotion'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df, ['Priority', 'Emotion','Emotionality'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df, ['Emotionality'], 'Duration_hours', kernel='rbf')

Mean Squared Error: 232207581.0583016
R^2 Score: -0.20929483494340717
Mean Squared Error: 232422727.57644933
R^2 Score: -0.21041527886675526
Mean Squared Error: 234166777.30200255
R^2 Score: -0.21949797252982917


In [None]:
mse_rbf, r2_rbf = SVMregressionModel(df, ['Title'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df, ['Description'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df, ['Emotion', 'Emotionality'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df, ['Title', 'Emotion'], 'Duration_hours', kernel='rbf')

In [22]:
print(df['combined'].iloc[0])

KeyError: 'combined'

In [9]:
print(df[['Emotion', 'Emotionality']].corr())

ValueError: could not convert string to float: 'negative'

In [2]:
from sklearn.ensemble import RandomForestRegressor


def RandomForestRegressionModel(df, feature_cols, target_col, n_estimators=100, test_size=0.2, random_state=42):
    # Fill missing values in feature columns
    df[feature_cols] = df[feature_cols].fillna(' ')
    
    # Combine feature columns into a single column for vectorization
    df['combined'] = df[feature_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    # Encode the combined text column
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['combined'])

    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the Random Forest regression model with the specified number of estimators
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')
    
    return mse, r2


In [None]:
mse, r2 = RandomForestRegressionModel(df, ['Priority', 'Emotion'], 'Duration_hours')
mse, r2 = RandomForestRegressionModel(df, ['Priority', 'Emotion','Emotionality'], 'Duration_hours')
mse, r2 = RandomForestRegressionModel(df, ['Title'], 'Duration_hours')
mse, r2 = RandomForestRegressionModel(df, ['Description'], 'Duration_hours')

Mean Squared Error: 182939447.5262612
R^2 Score: 0.04728464078283767
Mean Squared Error: 199718104.13717604
R^2 Score: -0.04009555018420041


In [25]:
!pip install xgboost
import xgboost as xgb

def XGBoostRegressionModel(df, feature_cols, target_col, n_estimators=100, learning_rate=0.1, max_depth=5, test_size=0.2, random_state=42):
    # Fill missing values in feature columns
    df[feature_cols] = df[feature_cols].fillna(' ')

    X = df[feature_cols]

    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the XGBoost regression model with the specified parameters
    model = xgb.XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=random_state)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')
    
    return mse, r2



In [26]:
mse, r2 = XGBoostRegressionModel(df, ['Priority', 'Emotion','Emotionality'], 'Duration_hours')
mse, r2 = XGBoostRegressionModel(df, ['Title'], 'Duration_hours')

Mean Squared Error: 183710268.50366926
R^2 Score: 0.043270345373543795


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Title: object