In [30]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

df = pd.read_csv('sentiAnalysis.csv')  # Replace with the actual path to your training set
# Map the Priority column
df['Priority'] = df['Priority'].map({'P1': 5, 'P2': 4, 'P3': 3, 'P4': 2, 'P5': 1})
df
df['Duration_hours']=df['Duration_hours']/24

In [14]:
df2 = df[df['Resolution'] == 'FIXED']

In [31]:
def remove_outliers(df, column):
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    
    # Calculate IQR (Interquartile Range)
    IQR = Q3 - Q1
    
    # Define the lower and upper bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Filter out the outliers
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    
    return df
df = remove_outliers(df, 'Duration_hours')

In [16]:
def SVMregressionModel(df, feature_cols, target_col, kernel='rbf', test_size=0.2, random_state=42):
    # Fill missing values in feature columns
    df = df.copy()
    X = df[feature_cols]
    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the SVM regression model with the specified kernel
    model = SVR(kernel=kernel)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')
    
    return mse, r2 

In [19]:
mse_rbf, r2_rbf = SVMregressionModel(df, ['Priority','Emotionality'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df, ['Emotionality'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df, ['Priority'], 'Duration_hours', kernel='rbf')

Mean Squared Error: 29685.201431027464
R^2 Score: -0.19918265305715388
Mean Squared Error: 30661.59666888933
R^2 Score: -0.2386257484490466
Mean Squared Error: 29204.06727499582
R^2 Score: -0.17974644559037944


In [32]:
mse_rbf, r2_rbf = SVMregressionModel(df, ['Priority','Emotionality'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df, ['Emotionality'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df, ['Priority'], 'Duration_hours', kernel='rbf')

Mean Squared Error: 4765.333223479596
R^2 Score: -0.20734177502789897
Mean Squared Error: 4956.6020167761935
R^2 Score: -0.2558015140590215
Mean Squared Error: 4730.344885145371
R^2 Score: -0.19847715202491956


In [22]:
mse_rbf, r2_rbf = SVMregressionModel(df, ['Pos_Score'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df, ['Neg_Score'], 'Duration_hours', kernel='rbf')

Mean Squared Error: 30646.306058530296
R^2 Score: -0.2380080590343454
Mean Squared Error: 30666.95161269306
R^2 Score: -0.23884207023256043


In [28]:
df

Unnamed: 0,Issue_id,Priority,Component,Duplicated_issue,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Duration_hours,TimeLabel,Pos_Score,Neg_Score,Emotion,Emotionality,Destiny
0,2,1,Team,,Opening repository resources doesnt honor type...,open repository resource always open default t...,RESOLVED,FIXED,2.0,2001-10-11 01:34:00+00:00,2002-05-07 14:33:56+00:00,208.540972,long,0.020833,0.056818,negative,0.077652,Fixed
1,4,1,Team,,need better error message if catching up over ...,- become synchronize project repository ; - us...,RESOLVED,FIXED,2.0,2001-10-11 01:34:00+00:00,2002-03-01 21:27:31+00:00,141.828472,long,0.049342,0.092105,negative,0.141447,Fixed
2,6,1,Team,,API - IResource.setLocal has problems (1G5TC8L),iresource.setlocal problems . method ( which r...,RESOLVED,INVALID,2.0,2001-10-11 01:34:00+00:00,2002-02-07 21:29:37+00:00,119.829861,long,0.042969,0.050781,negative,0.093750,Not Fixed
3,9,3,Team,,VCM Implementation - disallow root resource to...,implementation change root resource might pass...,RESOLVED,WONTFIX,2.0,2001-10-11 01:34:00+00:00,2001-10-24 03:39:17+00:00,13.086806,short,0.034091,0.056818,negative,0.090909,Not Fixed
4,11,3,Team,,API: ISharingManager::load mapping vcm project...,jean-michel ( 08/02/2001 1:38:48 pm ) ; ; isha...,RESOLVED,WONTFIX,2.0,2001-10-11 01:34:00+00:00,2001-10-24 03:39:22+00:00,13.086806,short,0.035937,0.018750,positive,0.054688,Not Fixed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57845,229777,3,UI,,[Viewers] Wrong argument in the first statemen...,build id : 3.3 ( i sure mean build id ; 3.3 ec...,RESOLVED,FIXED,3.3,2008-05-01 13:47:00+00:00,2008-05-10 14:06:07+00:00,9.013194,short,0.064024,0.027439,positive,0.091463,Fixed
57846,229779,3,UI,,NPE in performance tests,several npes within ui session test prevent ru...,VERIFIED,FIXED,3.4,2008-05-01 13:52:00+00:00,2008-05-20 14:12:14+00:00,19.013889,short,0.033333,0.033333,negative,0.066667,Fixed
57847,229782,3,UI,,Performance tests for ICU Collator,i20080501-0100 ; ; use collator ( see dependan...,VERIFIED,FIXED,3.4,2008-05-01 14:05:00+00:00,2009-06-01 18:25:12+00:00,396.180556,long,0.025000,0.000000,positive,0.025000,Fixed
57848,229789,3,UI,,[Examples] examples plugins create duplicate m...,create attachment 98318 ; screenshot ; ; i2008...,VERIFIED,FIXED,3.4,2008-05-01 15:02:00+00:00,2008-05-31 01:57:57+00:00,29.454861,short,0.092105,0.013158,positive,0.105263,Fixed


In [37]:
df3 = df[df['Emotion'] == 'negative']
df4 = df[df['Emotion'] == 'positive']
mse_rbf, r2_rbf = SVMregressionModel(df3, ['Emotionality'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df4, ['Emotionality'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df3, ['Neg_Score'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df4, ['Pos_Score'], 'Duration_hours', kernel='rbf')


Mean Squared Error: 4786.747988509045
R^2 Score: -0.25362744180831687
Mean Squared Error: 5080.906981779695
R^2 Score: -0.2546061336894512
Mean Squared Error: 4785.348822645439
R^2 Score: -0.2532610066572565
Mean Squared Error: 5080.1312645883545
R^2 Score: -0.25441458923690474


In [36]:
df['Score_Difference'] = df['Pos_Score'] - df['Neg_Score']
mse_rbf, r2_rbf = SVMregressionModel(df3, ['Score_Difference'], 'Duration_hours', kernel='rbf')
mse_rbf, r2_rbf = SVMregressionModel(df4, ['Score_Difference'], 'Duration_hours', kernel='rbf')

Mean Squared Error: 4783.2613612462
R^2 Score: -0.2527143100481841
Mean Squared Error: 5078.454990092302
R^2 Score: -0.25400067410875216


In [2]:
from sklearn.ensemble import RandomForestRegressor


def RandomForestRegressionModel(df, feature_cols, target_col, n_estimators=100, test_size=0.2, random_state=42):
    # Fill missing values in feature columns
    df[feature_cols] = df[feature_cols].fillna(' ')
    
    # Combine feature columns into a single column for vectorization
    df['combined'] = df[feature_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    # Encode the combined text column
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['combined'])

    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the Random Forest regression model with the specified number of estimators
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')
    
    return mse, r2


In [None]:
mse, r2 = RandomForestRegressionModel(df, ['Priority', 'Emotion'], 'Duration_hours')
mse, r2 = RandomForestRegressionModel(df, ['Priority', 'Emotion','Emotionality'], 'Duration_hours')
mse, r2 = RandomForestRegressionModel(df, ['Title'], 'Duration_hours')
mse, r2 = RandomForestRegressionModel(df, ['Description'], 'Duration_hours')

Mean Squared Error: 182939447.5262612
R^2 Score: 0.04728464078283767
Mean Squared Error: 199718104.13717604
R^2 Score: -0.04009555018420041


In [25]:
!pip install xgboost
import xgboost as xgb

def XGBoostRegressionModel(df, feature_cols, target_col, n_estimators=100, learning_rate=0.1, max_depth=5, test_size=0.2, random_state=42):
    # Fill missing values in feature columns
    df[feature_cols] = df[feature_cols].fillna(' ')

    X = df[feature_cols]

    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the XGBoost regression model with the specified parameters
    model = xgb.XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=random_state)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')
    
    return mse, r2



In [26]:
mse, r2 = XGBoostRegressionModel(df, ['Priority', 'Emotion','Emotionality'], 'Duration_hours')
mse, r2 = XGBoostRegressionModel(df, ['Title'], 'Duration_hours')

Mean Squared Error: 183710268.50366926
R^2 Score: 0.043270345373543795


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Title: object

NameError: name 'df' is not defined

In [7]:
df

Unnamed: 0,Issue_id,Priority,Component,Duplicated_issue,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Duration_hours,TimeLabel,Pos_Score,Neg_Score,Emotion,Emotionality
1,2,1,Team,,Opening repository resources doesnt honor type...,open repository resource always open default t...,RESOLVED,FIXED,2.0,2001-10-11 01:34:00+00:00,2002-05-07 14:33:56+00:00,5004.983333,short,0.020833,0.056818,0,0.077652
3,4,1,Team,,need better error message if catching up over ...,- become synchronize project repository ; - us...,RESOLVED,FIXED,2.0,2001-10-11 01:34:00+00:00,2002-03-01 21:27:31+00:00,3403.883333,short,0.049342,0.092105,0,0.141447
12,13,3,Team,,CC Discussion: local versioning (1GAT3PL),would make sense ( the ? ) project version men...,VERIFIED,FIXED,2.0,2001-10-11 01:34:00+00:00,2001-10-29 21:51:09+00:00,452.283333,short,0.036232,0.020833,1,0.057065
15,16,1,Team,,auto-merge button (1GBBEBB),merge view ; id like auto-merge button semanti...,RESOLVED,FIXED,2.0,2001-10-11 01:35:00+00:00,2002-02-08 19:43:37+00:00,2898.133333,short,0.032143,0.028571,1,0.060714
16,17,1,Team,,look at gender change cases (1GBCX61),need look gender change case ; give path ; - w...,RESOLVED,FIXED,2.0,2001-10-11 01:35:00+00:00,2002-05-24 01:20:53+00:00,5399.750000,short,0.020833,0.044643,0,0.065476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68116,229765,3,UI,,[CommonNavigator] NPE opening .project file in...,i20080430 ; ; self host eclipse try open .proj...,VERIFIED,FIXED,3.4,2008-05-01 12:09:00+00:00,2009-04-28 18:07:48+00:00,8693.966667,long,0.037500,0.068750,0,0.106250
68118,229776,3,SWT,,SWT.TOGGLE style bit stops setRegion from work...,swt.jar i20080429-0100 build ; setregion still...,RESOLVED,FIXED,3.4,2008-05-01 13:32:00+00:00,2008-05-05 21:13:32+00:00,103.683333,short,0.058824,0.022059,1,0.080882
68119,229777,3,UI,,[Viewers] Wrong argument in the first statemen...,build id : 3.3 ( i sure mean build id ; 3.3 ec...,RESOLVED,FIXED,3.3,2008-05-01 13:47:00+00:00,2008-05-10 14:06:07+00:00,216.316667,short,0.064024,0.027439,1,0.091463
68120,229779,3,UI,,NPE in performance tests,several npes within ui session test prevent ru...,VERIFIED,FIXED,3.4,2008-05-01 13:52:00+00:00,2008-05-20 14:12:14+00:00,456.333333,short,0.033333,0.033333,0,0.066667
