In [1]:
import pandas as pd
import glob
import numpy as np

# Assuming all files are in the same directory and follow the same naming pattern
# Adjust the path and pattern as needed
file_pattern = 'F:/finetunining sample/Spring_JIRA_Bug_Dataset_new/Spring_JIRA_Bug_Dataset/*.csv'

# Get a list of all CSV files
all_files = glob.glob(file_pattern)

# Load and concatenate all files into a single DataFrame
df_list = [pd.read_csv(file) for file in all_files]
data = pd.concat(df_list, ignore_index=True)

# Check the shape of the concatenated DataFrame
print(f"Total records: {data.shape[0]}")


Total records: 22652


In [2]:
data['Description'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Description'].fillna('', inplace=True)


In [3]:
data[((data['Description'].isna()) | (data['Summary'].isna())) & (data['Priority'].notna())]

Unnamed: 0,Summary,Issue key,Issue id,Issue Type,Status,Project key,Project name,Project type,Project lead,Project description,...,Affects Version/s.24,Affects Version/s.25,Affects Version/s.26,Affects Version/s.27,Affects Version/s.28,Component/s.7,Component/s.8,Component/s.9,Component/s.10,Component/s.11


In [4]:
# Drop columns with 100% null values
data = data.dropna(axis=1, how='all')

In [5]:
#df = data[['Summary','Priority','Created','Resolved',"Description"]]

df = data[['Summary','Priority','Created','Resolved']]

In [6]:
#df['input_text'] = df['Summary'] + ' ' + df['Description']

df['input_text'] = df['Summary'] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['input_text'] = df['Summary']


In [7]:
#df = df.drop(['Summary','Description'],axis=1)


df = df.drop(['Summary'],axis=1)

In [8]:
priority_mapping = {
    'Critical': 'High',
    'Blocker': 'High',
    'Major': 'Medium',
    'Minor': 'Low',
    'Trivial': 'Low'
}

In [9]:
df['priority_binned'] = df['Priority'].map(priority_mapping)

In [10]:
df['priority_binned'].value_counts()

priority_binned
Low       10912
Medium     9405
High       1974
Name: count, dtype: int64

In [11]:
df

Unnamed: 0,Priority,Created,Resolved,input_text,priority_binned
0,Major,22/Oct/10 3:13 AM,27/Oct/10 1:48 AM,Webflow 2.2 problem when using MVC view resolver,Medium
1,Major,14/Oct/10 2:22 AM,27/Oct/10 1:03 AM,Regression in showing global messages only,Medium
2,Minor,28/Sep/10 2:42 PM,01/Oct/10 2:58 AM,FlowViewResponseStateManager should call deleg...,Low
3,Major,27/Sep/10 11:34 PM,28/Sep/10 9:35 AM,javax.faces.ViewState hidden field resolves to...,Medium
4,Major,27/Sep/10 11:28 PM,28/Sep/10 9:36 AM,FlowFacesContext.newInstance should take into ...,Medium
...,...,...,...,...,...
22647,Major,13/Aug/09 7:04 PM,04/Sep/09 4:07 AM,NPE when handling the empty namespace as a cus...,Medium
22648,Major,12/Aug/09 1:28 AM,08/Sep/09 3:51 AM,ConcurrentModificationException in SimpleJdbcC...,Medium
22649,Major,11/Aug/09 7:28 PM,28/Oct/09 6:09 AM,StatementCreatorUtils.setNull throws exception...,Medium
22650,Major,10/Aug/09 3:28 PM,06/Sep/09 11:48 PM,@Configurable BeanCurrentlyInCreationException...,Medium


In [12]:
df['bug_created_date'] = pd.to_datetime(df['Created'], format='%d/%b/%y %I:%M %p')
df['bug_resolved_date'] = pd.to_datetime(df['Resolved'], format='%d/%b/%y %I:%M %p')

# Calculate the difference in days between the created and resolved dates
df['resolution_time_days'] = (df['bug_resolved_date'] - df['bug_created_date']).dt.total_seconds() / (60 * 60 * 24)


In [13]:
df = df.drop(['Priority','bug_created_date','bug_resolved_date','Created','Resolved'],axis=1)

In [14]:
df

Unnamed: 0,input_text,priority_binned,resolution_time_days
0,Webflow 2.2 problem when using MVC view resolver,Medium,4.940972
1,Regression in showing global messages only,Medium,12.945139
2,FlowViewResponseStateManager should call deleg...,Low,2.511111
3,javax.faces.ViewState hidden field resolves to...,Medium,0.417361
4,FlowFacesContext.newInstance should take into ...,Medium,0.422222
...,...,...,...
22647,NPE when handling the empty namespace as a cus...,Medium,21.377083
22648,ConcurrentModificationException in SimpleJdbcC...,Medium,27.099306
22649,StatementCreatorUtils.setNull throws exception...,Medium,77.445139
22650,@Configurable BeanCurrentlyInCreationException...,Medium,27.347222


In [15]:
resolution_bins = [0, 3, 14, np.inf]  # Example: Short-Term (0-3 days), Medium-Term (4-14 days), Long-Term (>14 days)
resolution_labels = ['Short-Term', 'Medium-Term', 'Long-Term']
df['bug_resolution_time'] = pd.cut(df['resolution_time_days'], bins=resolution_bins, labels=resolution_labels)


In [16]:
df['bug_resolution_time'].value_counts()

bug_resolution_time
Short-Term     9118
Long-Term      8933
Medium-Term    4538
Name: count, dtype: int64

In [17]:
df = df.drop(['resolution_time_days'],axis=1)

In [18]:
df

Unnamed: 0,input_text,priority_binned,bug_resolution_time
0,Webflow 2.2 problem when using MVC view resolver,Medium,Medium-Term
1,Regression in showing global messages only,Medium,Medium-Term
2,FlowViewResponseStateManager should call deleg...,Low,Short-Term
3,javax.faces.ViewState hidden field resolves to...,Medium,Short-Term
4,FlowFacesContext.newInstance should take into ...,Medium,Short-Term
...,...,...,...
22647,NPE when handling the empty namespace as a cus...,Medium,Long-Term
22648,ConcurrentModificationException in SimpleJdbcC...,Medium,Long-Term
22649,StatementCreatorUtils.setNull throws exception...,Medium,Long-Term
22650,@Configurable BeanCurrentlyInCreationException...,Medium,Long-Term


In [19]:
#df = df.dropna(subset=['priority_binned'])
#df = df.dropna(subset=['bug_resolution_time'])

In [20]:
df['priority_binned'].value_counts()

priority_binned
Low       10912
Medium     9405
High       1974
Name: count, dtype: int64

In [21]:
df['priority_binned'] = df['priority_binned'].fillna("High")

In [22]:
df['bug_resolution_time'].value_counts()

bug_resolution_time
Short-Term     9118
Long-Term      8933
Medium-Term    4538
Name: count, dtype: int64

In [23]:
df['bug_resolution_time'] = df['bug_resolution_time'].fillna("Medium-Term")

In [24]:
df[df['input_text'].isna()]

Unnamed: 0,input_text,priority_binned,bug_resolution_time


In [25]:
df.to_csv("combined_dataset.csv",index=False)

In [26]:
df

Unnamed: 0,input_text,priority_binned,bug_resolution_time
0,Webflow 2.2 problem when using MVC view resolv...,Medium,Medium-Term
1,Regression in showing global messages only Glo...,Medium,Medium-Term
2,FlowViewResponseStateManager should call deleg...,Low,Short-Term
3,javax.faces.ViewState hidden field resolves to...,Medium,Short-Term
4,FlowFacesContext.newInstance should take into ...,Medium,Short-Term
...,...,...,...
22647,NPE when handling the empty namespace as a cus...,Medium,Long-Term
22648,ConcurrentModificationException in SimpleJdbcC...,Medium,Long-Term
22649,StatementCreatorUtils.setNull throws exception...,Medium,Long-Term
22650,@Configurable BeanCurrentlyInCreationException...,Medium,Long-Term


In [29]:
priority_distribution = df['priority_binned'].value_counts()
resolution_time_distribution = df['bug_resolution_time'].value_counts()

print("Priority Distribution:\n", priority_distribution)
print("\nResolution Time Distribution:\n", resolution_time_distribution)

Priority Distribution:
 priority_binned
Low       10912
Medium     9405
High       2335
Name: count, dtype: int64

Resolution Time Distribution:
 bug_resolution_time
Short-Term     9118
Long-Term      8933
Medium-Term    4601
Name: count, dtype: int64
