In [None]:
# Attach Google Drive to the Colab runtime so files under it can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import pandas as pd
import plotly.graph_objs as go


# Load the token-level labeled corpus from the mounted Drive.
#
# pandas' default NA parsing converts literal cell values such as "NULL", "NA"
# or "nan" into missing values, and this notebook later drops every row with a
# missing token. Only genuinely empty cells are treated as missing here, so
# tokens that merely look like NA markers survive as ordinary text.
data = pd.read_csv(
    "/content/drive/MyDrive/NER_CS/Full_Labeled_Corpus.csv",
    keep_default_na=False,
    na_values=[""],
)

In [None]:
# Peek at the first five rows (head() defaults to five).
data.head()

Unnamed: 0,words,labels,sentence_id
0,A,O,0
1,remote,B-relevant_term,0
2,code,B-relevant_term,0
3,execution,O,0
4,vulnerability,O,0


In [None]:
# (row count, column count) of the frame as loaded.
data.shape

(814114, 3)

In [None]:
# Switch to the tokens/tags column names used by the rest of the notebook.
column_renames = {"words": 'tokens', "labels": "tags"}
data = data.rename(columns=column_renames)

In [None]:
# Discard the sentence identifier column, then preview what is left.
data = data.drop(columns=['sentence_id'])
data.head()

Unnamed: 0,tokens,tags
0,A,O
1,remote,B-relevant_term
2,code,B-relevant_term
3,execution,O
4,vulnerability,O


In [None]:
# Missing-value count per column, before cleaning.
data.isna().sum()

tokens    1249
tags         0
dtype: int64

In [None]:
# Remove every row that has a missing value in any column.
data = data.dropna()

In [None]:
# Re-check: every column should now report zero missing values.
data.isna().sum()

tokens    0
tags      0
dtype: int64

In [None]:
# Tally how many rows carry each tag (value_counts sorts most frequent first).
class_counts = data['tags'].value_counts()
print(class_counts)

# One sky-blue bar per tag; bar height is that tag's row count.
tag_bars = go.Bar(
    x=class_counts.index,
    y=class_counts.values,
    marker_color='skyblue',
)
fig = go.Figure(data=[tag_bars])

# Titles, transparent plot background, fixed 800x600 canvas, no legend.
fig.update_layout(
    title='Distribution of Target Classes',
    xaxis={'title': 'Target Class'},
    yaxis={'title': 'Count'},
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',
    width=800,
    height=600,
)

# Render the chart in the cell output.
fig.show()


O                         563157
B-relevant_term            77114
I-relevant_term            50051
B-version                  29880
I-version                  25321
B-application              20525
I-application              13191
B-vendor                   11433
B-update                    4260
B-os                        4244
B-file                      3222
B-cve id                    3158
I-os                        2778
B-function                  1468
B-parameter                  657
B-edition                    595
B-hardware                   587
I-hardware                   586
I-update                     204
B-method                     175
B-programming language       168
I-vendor                      55
I-edition                     29
B-language                     7
Name: tags, dtype: int64


In [None]:
# Collapse repeated rows to a single copy each; drop_duplicates retains the
# earliest occurrence by default.
data = data.drop_duplicates()

print("DataFrame after removing duplicates, keeping the first occurrence:")
data.head()


DataFrame after removing duplicates, keeping the first occurrence:


Unnamed: 0,tokens,tags
0,A,O
1,remote,B-relevant_term
2,code,B-relevant_term
3,execution,O
4,vulnerability,O


In [None]:
# Tally how many rows carry each tag (value_counts sorts most frequent first).
class_counts = data['tags'].value_counts()
print(class_counts)

# One sky-blue bar per tag; bar height is that tag's row count.
tag_bars = go.Bar(
    x=class_counts.index,
    y=class_counts.values,
    marker_color='skyblue',
)
fig = go.Figure(data=[tag_bars])

# Titles, transparent plot background, fixed 800x600 canvas, no legend.
fig.update_layout(
    title='Distribution of Target Classes',
    xaxis={'title': 'Target Class'},
    yaxis={'title': 'Count'},
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',
    width=800,
    height=600,
)

# Render the chart in the cell output.
fig.show()


O                         21705
I-version                  3571
B-application              3405
B-version                  3242
B-file                     2722
B-function                 1371
I-application              1247
B-cve id                   1196
B-vendor                    650
B-parameter                 563
B-update                    260
I-hardware                  224
B-hardware                  220
B-method                    157
B-relevant_term             151
B-edition                    78
I-relevant_term              70
B-os                         53
I-os                         35
I-vendor                     34
I-update                     32
I-edition                    17
B-language                    4
B-programming language        1
Name: tags, dtype: int64


In [None]:
import pandas as pd

# Hand-picked programming-language names to add as extra labeled tokens.
programming_languages = [
    'Python', 'C', 'R', 'SQL', 'GO',
    'Java', 'C++', 'JavaScript', 'Ruby',
    'Swift', 'Rust', 'Go', 'PHP', 'Perl',
]

# Two-column frame matching the corpus layout: one row per language name,
# every row tagged 'B-PL'.
df = pd.DataFrame({
    'tokens': programming_languages,
    'tags': ['B-PL' for _ in programming_languages],
})

print(df)


        tokens  tags
0       Python  B-PL
1            C  B-PL
2            R  B-PL
3          SQL  B-PL
4           GO  B-PL
5         Java  B-PL
6          C++  B-PL
7   JavaScript  B-PL
8         Ruby  B-PL
9        Swift  B-PL
10        Rust  B-PL
11          Go  B-PL
12         PHP  B-PL
13        Perl  B-PL


In [None]:
# Stack the language rows beneath the corpus rows; ignore_index renumbers the
# combined frame 0..n-1 in the same step.
data = pd.concat([data, df], ignore_index=True)

# Preview the combined frame
data.head()

Unnamed: 0,tokens,tags
0,A,O
1,remote,B-relevant_term
2,code,B-relevant_term
3,execution,O
4,vulnerability,O


In [None]:
# Sanity check: number of fully duplicated rows left after appending the language rows.
data.duplicated().sum()
# (for reference, the raw corpus had 814114 rows before cleaning)

0

In [None]:
# Rows per tag after the language rows were appended.
data['tags'].value_counts()

O                         21705
I-version                  3571
B-application              3405
B-version                  3242
B-file                     2722
B-function                 1371
I-application              1247
B-cve id                   1196
B-vendor                    650
B-parameter                 563
B-update                    260
I-hardware                  224
B-hardware                  220
B-method                    157
B-relevant_term             151
B-edition                    78
I-relevant_term              70
B-os                         53
I-os                         35
I-vendor                     34
I-update                     32
I-edition                    17
B-PL                         14
B-language                    4
B-programming language        1
Name: tags, dtype: int64

In [None]:
# Keep every row except those tagged 'B-programming language'.
is_other_tag = data['tags'].ne('B-programming language')
data = data.loc[is_other_tag]

In [None]:
# Put all rows in a reproducible random order (fixed seed), then renumber
# them 0..n-1 so the index no longer reflects the pre-shuffle order.
shuffled = data.sample(frac=1, random_state=42)
data = shuffled.reset_index(drop=True)

# Preview the shuffled frame
data.head()

Unnamed: 0,tokens,tags
0,ORF,O
1,CallHomeExec,O
2,Mediator,O
3,aspnet_client,O
4,MC_EXT_SELECTED,O


In [None]:
# Tally how many rows carry each tag (value_counts sorts most frequent first).
class_counts = data['tags'].value_counts()
print(class_counts)

# One sky-blue bar per tag; bar height is that tag's row count.
tag_bars = go.Bar(
    x=class_counts.index,
    y=class_counts.values,
    marker_color='skyblue',
)
fig = go.Figure(data=[tag_bars])

# Titles, transparent plot background, fixed 800x600 canvas, no legend.
fig.update_layout(
    title='Distribution of Target Classes',
    xaxis={'title': 'Target Class'},
    yaxis={'title': 'Count'},
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',
    width=800,
    height=600,
)

# Render the chart in the cell output.
fig.show()


O                  21705
I-version           3571
B-application       3405
B-version           3242
B-file              2722
B-function          1371
I-application       1247
B-cve id            1196
B-vendor             650
B-parameter          563
B-update             260
I-hardware           224
B-hardware           220
B-method             157
B-relevant_term      151
B-edition             78
I-relevant_term       70
B-os                  53
I-os                  35
I-vendor              34
I-update              32
I-edition             17
B-PL                  14
B-language             4
Name: tags, dtype: int64


In [None]:
import pandas as pd

# Downsample one over-represented tag: keep a random 30% of its rows and drop
# the remaining 70%. Rows carrying any other tag are left untouched.

# Tag whose rows are thinned out
class_to_delete = 'O'
# Fraction of that tag's rows that survive the downsampling
keep_fraction = 0.3

# All rows currently carrying the tag (filtered once, reused below)
rows_of_class = data[data['tags'] == class_to_delete]
indices_to_delete = rows_of_class.index

# Reproducibly pick the rows to keep; int() truncates a fractional count
num_to_keep = int(keep_fraction * len(indices_to_delete))
indices_to_keep = rows_of_class.sample(n=num_to_keep, random_state=42).index

# Every row of that tag that was not picked gets removed
indices_to_remove = indices_to_delete.difference(indices_to_keep)
data = data.drop(indices_to_remove)

# `data` now holds 30% of the rows of that tag; its index is not renumbered,
# so it has gaps where rows were dropped.


In [None]:
# Tally how many rows carry each tag (value_counts sorts most frequent first).
class_counts = data['tags'].value_counts()
print(class_counts)

# One sky-blue bar per tag; bar height is that tag's row count.
tag_bars = go.Bar(
    x=class_counts.index,
    y=class_counts.values,
    marker_color='skyblue',
)
fig = go.Figure(data=[tag_bars])

# Titles, transparent plot background, fixed 800x600 canvas, no legend.
fig.update_layout(
    title='Distribution of Target Classes',
    xaxis={'title': 'Target Class'},
    yaxis={'title': 'Count'},
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',
    width=800,
    height=600,
)

# Render the chart in the cell output.
fig.show()


O                  6511
I-version          3571
B-application      3405
B-version          3242
B-file             2722
B-function         1371
I-application      1247
B-cve id           1196
B-vendor            650
B-parameter         563
B-update            260
I-hardware          224
B-hardware          220
B-method            157
B-relevant_term     151
B-edition            78
I-relevant_term      70
B-os                 53
I-os                 35
I-vendor             34
I-update             32
I-edition            17
B-PL                 14
B-language            4
Name: tags, dtype: int64


In [None]:
# Group consecutive rows into chunks of 8 and collect each chunk's tokens and
# tags into lists (one output row per chunk; the last chunk may be shorter).
chunk_size = 8

# Renumber rows 0..n-1 first: rows were dropped earlier without renumbering,
# and grouping on a gappy index would produce chunks of uneven size.
data = data.reset_index(drop=True)
data = data.groupby(data.index // chunk_size).agg(lambda x: x.tolist())

# Displaying the resulting DataFrame
data

Unnamed: 0,tokens,tags
0,"[ORF, aspnet_client, casino, 4.8]","[O, O, O, I-version]"
1,"[3.6.3, CVE-2010-3646, Forms]","[B-version, B-cve id, B-application]"
2,"[0091, 1.15, Mandrill, repository/lib.php, HDW...","[B-version, B-version, B-application, B-file, ..."
3,"[do, gfxTextRun::GetUserData, vxveautil.kv_bin...","[O, B-function, B-function, O, I-version]"
4,"[util/doh/runner.html, JSDependentString::unde...","[B-file, B-function, I-version]"
...,...,...
5123,"[write-only, bdf/bdflib.c, EMC, Virus]","[O, B-file, B-vendor, I-application]"
5124,"[com_ninjamonials, takes, Processing]","[O, O, B-application]"
5125,"[ulp/sdp/sdp_proc.c, jpc_cox_getcompparms, 8.5...","[B-file, B-function, I-version, I-version]"
5126,"[SQL, escapes, party, ZENworks, NetArtMEDIA, A...","[I-relevant_term, O, O, B-application, B-vendo..."


In [None]:
# Write the frame to a CSV file in the working directory, without the index
# column. Cells that hold Python lists are stored as their text representation.
output_csv = "NER_main_Datas.csv"
data.to_csv(output_csv, index=False)