SVM for Classification of Salmonella Plasmids into two classes: (virulent and non-virulent)

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
data = pd.read_excel("sal.xlsx")
sequences = data["Sequence"]
data["Virulent"] = data["Virulent"].fillna(0).replace("Yes", 1)


In [2]:
import pandas as pd
import itertools

# Defining the function to calculate n-gram frequencies
def calculate_ngram_frequencies(sequence, n):
    ngram_freq = {}
    for i in range(len(sequence) - n + 1):
        ngram = sequence[i:i+n]
        ngram_freq[ngram] = ngram_freq.get(ngram, 0) + 1
    return ngram_freq

# Defining the function to create feature vectors
def create_feature_vector(row, n, feature_vector):
    sequence = row['Sequence']
    ngram_freq = calculate_ngram_frequencies(sequence, n)
    # Initializing the feature vector with zeros
    vector = feature_vector.copy()
    # Updating the feature vector with the frequency of each n-gram
    for ngram, freq in ngram_freq.items():
        vector[ngram] = freq
    return pd.Series(vector)



df = {'Sequence': data['Sequence'].tolist()}  # Extracting the 'Sequence' column as a list

# Converting the list of sequences to a dictionary with key 'Sequence'
data_dict = {'Sequence': df['Sequence']}

datF = pd.DataFrame(data_dict)
# Defining the value of n for n-grams
n = 3

# Generating the feature vector template
amino_acids = "ACDEFGHIKLMNPQRSTVWY"  # A string representing the 20 standard amino acids
feature_vector = {"".join(aa): 0 for aa in itertools.product(amino_acids, repeat=n)}

# Applying the function to create feature vectors for each row
feature_vectors = datF.apply(lambda row: create_feature_vector(row, n, feature_vector), axis=1)

# Combining sequence and feature vectors into a single DataFrame
result_df = pd.concat([datF['Sequence'], feature_vectors], axis=1)
data = pd.concat([data, result_df.drop(columns=['Sequence'])], axis=1, join='inner')










In [3]:
data

Unnamed: 0,Entry,Virulent,Entry Name,Protein names,Gene Names,Organism,Length,Gene Ontology IDs,Sequence,AAA,...,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY
0,A0A3T2ZW19,1,A0A3T2ZW19_SALET,Fatty acid oxidation complex subunit alpha [In...,fadB A4K93_08410 ATM10_11635 B9761_19260 CDK17...,Salmonella enterica subsp. enterica serovar Sc...,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,0,0,1,0,0,0,0,0,0,0
1,A0A3V4QAN5,0,A0A3V4QAN5_SALTH,Fatty acid oxidation complex subunit alpha [In...,fadB A9W32_22425 D6S79_17605 DSR63_19710 DYM67...,Salmonella thompson,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,0,0,1,0,0,0,0,0,0,0
2,A0A3V4QBH0,0,A0A3V4QBH0_SALTH,Siroheme synthase [Includes: Uroporphyrinogen-...,cobA cysG A9W32_19130 D4478_22640 D6S79_23280 ...,Salmonella thompson,457,GO:0004851; GO:0009236; GO:0019354; GO:0032259...,MDHLPIFCQLRDRDCLIVGGGDVAERKARLLLEAGARLTVNALTFI...,0,...,0,0,0,0,0,0,0,0,0,0
3,A0A3Y4A9R8,0,A0A3Y4A9R8_SALSE,Fatty acid oxidation complex subunit alpha [In...,fadB A0E85_19810 AA192_21415 AE408_20440 AL785...,Salmonella senftenberg,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,0,0,1,0,0,0,0,0,0,0
4,A0A401AN88,0,A0A401AN88_SALSE,Siroheme synthase [Includes: Uroporphyrinogen-...,cobA cysG A0E85_24000 A3030_23415 AA192_12785 ...,Salmonella senftenberg,457,GO:0004851; GO:0009236; GO:0019354; GO:0032259...,MDHLPIFCQLRDRDCLIVGGGDVAERKARLLLEAGARLTVNALTFI...,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,A0A403SIB8,0,A0A403SIB8_SALTH,DNA protection during starvation protein (EC 1...,dps pexB A9W32_12310 D6S79_19340 DSR63_18085 D...,Salmonella thompson,167,GO:0003677; GO:0005737; GO:0006879; GO:0008199...,MSTAKLVKTKASNLLYTRNDVSESDKKATVELLNRQVIQFIDLSLI...,0,...,0,0,0,0,0,0,0,0,0,0
995,A0A403SIE5,0,A0A403SIE5_SALTH,Probable phosphoglycerate mutase GpmB (EC 5.4....,gpmB A9W32_07200 D6S79_17020 DSR63_08815 DYM67...,Salmonella thompson,215,GO:0004619; GO:0006096; GO:0016020,MLQVYLVRHGETQWNAERRIQGQSDSPLTAKGEQQAMQVGERARSL...,0,...,0,0,0,0,0,0,0,0,0,0
996,A0A403SIE9,0,A0A403SIE9_SALTH,Pyruvate dehydrogenase [ubiquinone] (EC 1.2.5....,poxB A9W32_10525 D6S79_19235 DSR63_17710 DYM67...,Salmonella thompson,572,GO:0000287; GO:0005886; GO:0008289; GO:0030976...,MKQTVAAFIAKTLEQAGVKQIWGVTGDSLNGLSDSLNRMGTIEWMP...,0,...,0,0,0,0,0,0,0,0,0,0
997,A0A403SIF2,0,A0A403SIF2_SALTH,Isoleucine--tRNA ligase (EC 6.1.1.5) (Isoleucy...,ileS A9W32_19180 D6S79_16710 DSR63_08505 DYM67...,Salmonella thompson,944,GO:0000049; GO:0002161; GO:0004822; GO:0005524...,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
#code for One-Hot encoding
encoder = OneHotEncoder(handle_unknown='ignore')  # Ignore unknown characters
encoded_sequences = encoder.fit_transform(sequences.values.reshape(-1, 1))
encoded_features = encoded_sequences.toarray()


In [5]:
import numpy
virulence = data["Virulent"]
virulence = virulence.to_numpy() 



In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_selection import chi2
go_terms = data["Gene Ontology IDs"]
vectorizer = TfidfVectorizer()
go_features = vectorizer.fit_transform(go_terms)
go_features = go_features.T.tocsr()  # Transpose and convert to compressed sparse row format
go_features = vectorizer.fit_transform(go_terms)



In [7]:
chi2_scores, pvals = chi2(go_features, virulence)

In [8]:
sorted_indices = numpy.argsort(pvals)
sorted_indices

array([1012,  246, 1042, ...,  252,  296,  291], dtype=int64)

In [9]:
significant_terms = vectorizer.get_feature_names_out()[sorted_indices[:250]]
significant_terms= ["GO:" + term for term in significant_terms]



In [10]:
for term, score in zip(significant_terms, chi2_scores[sorted_indices[:100]]):
    print(f"- {term}: chi2 score = {score:.4f}, p-value = {pvals[sorted_indices[0]]:.4f}") 

- GO:0090729: chi2 score = 10.9271, p-value = 0.0009
- GO:0005576: chi2 score = 10.5461, p-value = 0.0009
- GO:0106274: chi2 score = 8.2287, p-value = 0.0009
- GO:0016779: chi2 score = 8.2287, p-value = 0.0009
- GO:0048038: chi2 score = 6.5719, p-value = 0.0009
- GO:0015444: chi2 score = 5.9652, p-value = 0.0009
- GO:0008137: chi2 score = 5.2116, p-value = 0.0009
- GO:1901505: chi2 score = 4.8427, p-value = 0.0009
- GO:0042221: chi2 score = 4.8427, p-value = 0.0009
- GO:0009986: chi2 score = 4.5637, p-value = 0.0009
- GO:0030254: chi2 score = 4.2231, p-value = 0.0009
- GO:0030257: chi2 score = 4.2231, p-value = 0.0009
- GO:0042773: chi2 score = 3.9668, p-value = 0.0009
- GO:0050136: chi2 score = 3.8734, p-value = 0.0009
- GO:0016817: chi2 score = 3.6709, p-value = 0.0009
- GO:0070417: chi2 score = 3.6709, p-value = 0.0009
- GO:0004222: chi2 score = 3.4221, p-value = 0.0009
- GO:0052042: chi2 score = 3.3698, p-value = 0.0009
- GO:0120258: chi2 score = 3.3698, p-value = 0.0009
- GO:00309

In [11]:
data

Unnamed: 0,Entry,Virulent,Entry Name,Protein names,Gene Names,Organism,Length,Gene Ontology IDs,Sequence,AAA,...,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY
0,A0A3T2ZW19,1,A0A3T2ZW19_SALET,Fatty acid oxidation complex subunit alpha [In...,fadB A4K93_08410 ATM10_11635 B9761_19260 CDK17...,Salmonella enterica subsp. enterica serovar Sc...,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,0,0,1,0,0,0,0,0,0,0
1,A0A3V4QAN5,0,A0A3V4QAN5_SALTH,Fatty acid oxidation complex subunit alpha [In...,fadB A9W32_22425 D6S79_17605 DSR63_19710 DYM67...,Salmonella thompson,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,0,0,1,0,0,0,0,0,0,0
2,A0A3V4QBH0,0,A0A3V4QBH0_SALTH,Siroheme synthase [Includes: Uroporphyrinogen-...,cobA cysG A9W32_19130 D4478_22640 D6S79_23280 ...,Salmonella thompson,457,GO:0004851; GO:0009236; GO:0019354; GO:0032259...,MDHLPIFCQLRDRDCLIVGGGDVAERKARLLLEAGARLTVNALTFI...,0,...,0,0,0,0,0,0,0,0,0,0
3,A0A3Y4A9R8,0,A0A3Y4A9R8_SALSE,Fatty acid oxidation complex subunit alpha [In...,fadB A0E85_19810 AA192_21415 AE408_20440 AL785...,Salmonella senftenberg,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,0,0,1,0,0,0,0,0,0,0
4,A0A401AN88,0,A0A401AN88_SALSE,Siroheme synthase [Includes: Uroporphyrinogen-...,cobA cysG A0E85_24000 A3030_23415 AA192_12785 ...,Salmonella senftenberg,457,GO:0004851; GO:0009236; GO:0019354; GO:0032259...,MDHLPIFCQLRDRDCLIVGGGDVAERKARLLLEAGARLTVNALTFI...,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,A0A403SIB8,0,A0A403SIB8_SALTH,DNA protection during starvation protein (EC 1...,dps pexB A9W32_12310 D6S79_19340 DSR63_18085 D...,Salmonella thompson,167,GO:0003677; GO:0005737; GO:0006879; GO:0008199...,MSTAKLVKTKASNLLYTRNDVSESDKKATVELLNRQVIQFIDLSLI...,0,...,0,0,0,0,0,0,0,0,0,0
995,A0A403SIE5,0,A0A403SIE5_SALTH,Probable phosphoglycerate mutase GpmB (EC 5.4....,gpmB A9W32_07200 D6S79_17020 DSR63_08815 DYM67...,Salmonella thompson,215,GO:0004619; GO:0006096; GO:0016020,MLQVYLVRHGETQWNAERRIQGQSDSPLTAKGEQQAMQVGERARSL...,0,...,0,0,0,0,0,0,0,0,0,0
996,A0A403SIE9,0,A0A403SIE9_SALTH,Pyruvate dehydrogenase [ubiquinone] (EC 1.2.5....,poxB A9W32_10525 D6S79_19235 DSR63_17710 DYM67...,Salmonella thompson,572,GO:0000287; GO:0005886; GO:0008289; GO:0030976...,MKQTVAAFIAKTLEQAGVKQIWGVTGDSLNGLSDSLNRMGTIEWMP...,0,...,0,0,0,0,0,0,0,0,0,0
997,A0A403SIF2,0,A0A403SIF2_SALTH,Isoleucine--tRNA ligase (EC 6.1.1.5) (Isoleucy...,ileS A9W32_19180 D6S79_16710 DSR63_08505 DYM67...,Salmonella thompson,944,GO:0000049; GO:0002161; GO:0004822; GO:0005524...,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
data["filtered_GO_terms"] = data["Gene Ontology IDs"].apply(
    lambda x: [term for term in x.split(";") if term in significant_terms]
)




In [13]:
# Check for empty lists and potentially handle them (e.g., remove rows)
data = data[data["filtered_GO_terms"].apply(len).gt(0)]
data

Unnamed: 0,Entry,Virulent,Entry Name,Protein names,Gene Names,Organism,Length,Gene Ontology IDs,Sequence,AAA,...,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY,filtered_GO_terms
0,A0A3T2ZW19,1,A0A3T2ZW19_SALET,Fatty acid oxidation complex subunit alpha [In...,fadB A4K93_08410 ATM10_11635 B9761_19260 CDK17...,Salmonella enterica subsp. enterica serovar Sc...,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,0,1,0,0,0,0,0,0,0,[GO:0003857]
1,A0A3V4QAN5,0,A0A3V4QAN5_SALTH,Fatty acid oxidation complex subunit alpha [In...,fadB A9W32_22425 D6S79_17605 DSR63_19710 DYM67...,Salmonella thompson,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,0,1,0,0,0,0,0,0,0,[GO:0003857]
2,A0A3V4QBH0,0,A0A3V4QBH0_SALTH,Siroheme synthase [Includes: Uroporphyrinogen-...,cobA cysG A9W32_19130 D4478_22640 D6S79_23280 ...,Salmonella thompson,457,GO:0004851; GO:0009236; GO:0019354; GO:0032259...,MDHLPIFCQLRDRDCLIVGGGDVAERKARLLLEAGARLTVNALTFI...,0,...,0,0,0,0,0,0,0,0,0,[GO:0004851]
3,A0A3Y4A9R8,0,A0A3Y4A9R8_SALSE,Fatty acid oxidation complex subunit alpha [In...,fadB A0E85_19810 AA192_21415 AE408_20440 AL785...,Salmonella senftenberg,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,0,1,0,0,0,0,0,0,0,[GO:0003857]
4,A0A401AN88,0,A0A401AN88_SALSE,Siroheme synthase [Includes: Uroporphyrinogen-...,cobA cysG A0E85_24000 A3030_23415 AA192_12785 ...,Salmonella senftenberg,457,GO:0004851; GO:0009236; GO:0019354; GO:0032259...,MDHLPIFCQLRDRDCLIVGGGDVAERKARLLLEAGARLTVNALTFI...,0,...,0,0,0,0,0,0,0,0,0,[GO:0004851]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,A0A403SI77,0,A0A403SI77_SALTH,4-hydroxy-3-methylbut-2-enyl diphosphate reduc...,ispH lytB A9W32_19165 D6S79_16695 DSR63_08490 ...,Salmonella thompson,316,GO:0016114; GO:0019288; GO:0046872; GO:0050992...,MQILLANPRGFCAGVDRAISIVENALAIYGAPIYVRHEVVHNRYVV...,0,...,0,0,0,0,0,0,0,0,0,[GO:0016114]
994,A0A403SIB8,0,A0A403SIB8_SALTH,DNA protection during starvation protein (EC 1...,dps pexB A9W32_12310 D6S79_19340 DSR63_18085 D...,Salmonella thompson,167,GO:0003677; GO:0005737; GO:0006879; GO:0008199...,MSTAKLVKTKASNLLYTRNDVSESDKKATVELLNRQVIQFIDLSLI...,0,...,0,0,0,0,0,0,0,0,0,[GO:0003677]
995,A0A403SIE5,0,A0A403SIE5_SALTH,Probable phosphoglycerate mutase GpmB (EC 5.4....,gpmB A9W32_07200 D6S79_17020 DSR63_08815 DYM67...,Salmonella thompson,215,GO:0004619; GO:0006096; GO:0016020,MLQVYLVRHGETQWNAERRIQGQSDSPLTAKGEQQAMQVGERARSL...,0,...,0,0,0,0,0,0,0,0,0,[GO:0004619]
996,A0A403SIE9,0,A0A403SIE9_SALTH,Pyruvate dehydrogenase [ubiquinone] (EC 1.2.5....,poxB A9W32_10525 D6S79_19235 DSR63_17710 DYM67...,Salmonella thompson,572,GO:0000287; GO:0005886; GO:0008289; GO:0030976...,MKQTVAAFIAKTLEQAGVKQIWGVTGDSLNGLSDSLNRMGTIEWMP...,0,...,0,0,0,0,0,0,0,0,0,[GO:0000287]


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [15]:
# Extracting unique GO terms
unique_go_terms = set(term for sublist in data['filtered_GO_terms'] for term in sublist)

# Creating a dictionary to map each GO term to a unique integer label
go_label_map = {go_term: label for label, go_term in enumerate(unique_go_terms)}

# Defining a function to assign unique integer labels to each GO term
def assign_labels(go_terms):
    return [go_label_map[term] for term in go_terms]

# Applying the function to the filtered_GO_terms column
data['go_labels'] = data['filtered_GO_terms'].apply(assign_labels)
data.loc[:, 'go_labels'] = data['go_labels'].apply(lambda x: x[0])
data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['go_labels'] = data['filtered_GO_terms'].apply(assign_labels)


Unnamed: 0,Entry,Virulent,Entry Name,Protein names,Gene Names,Organism,Length,Gene Ontology IDs,Sequence,AAA,...,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY,filtered_GO_terms,go_labels
0,A0A3T2ZW19,1,A0A3T2ZW19_SALET,Fatty acid oxidation complex subunit alpha [In...,fadB A4K93_08410 ATM10_11635 B9761_19260 CDK17...,Salmonella enterica subsp. enterica serovar Sc...,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,1,0,0,0,0,0,0,0,[GO:0003857],51
1,A0A3V4QAN5,0,A0A3V4QAN5_SALTH,Fatty acid oxidation complex subunit alpha [In...,fadB A9W32_22425 D6S79_17605 DSR63_19710 DYM67...,Salmonella thompson,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,1,0,0,0,0,0,0,0,[GO:0003857],51
2,A0A3V4QBH0,0,A0A3V4QBH0_SALTH,Siroheme synthase [Includes: Uroporphyrinogen-...,cobA cysG A9W32_19130 D4478_22640 D6S79_23280 ...,Salmonella thompson,457,GO:0004851; GO:0009236; GO:0019354; GO:0032259...,MDHLPIFCQLRDRDCLIVGGGDVAERKARLLLEAGARLTVNALTFI...,0,...,0,0,0,0,0,0,0,0,[GO:0004851],10
3,A0A3Y4A9R8,0,A0A3Y4A9R8_SALSE,Fatty acid oxidation complex subunit alpha [In...,fadB A0E85_19810 AA192_21415 AE408_20440 AL785...,Salmonella senftenberg,729,GO:0003857; GO:0004165; GO:0004300; GO:0006635...,MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGQALEVL...,1,...,1,0,0,0,0,0,0,0,[GO:0003857],51
4,A0A401AN88,0,A0A401AN88_SALSE,Siroheme synthase [Includes: Uroporphyrinogen-...,cobA cysG A0E85_24000 A3030_23415 AA192_12785 ...,Salmonella senftenberg,457,GO:0004851; GO:0009236; GO:0019354; GO:0032259...,MDHLPIFCQLRDRDCLIVGGGDVAERKARLLLEAGARLTVNALTFI...,0,...,0,0,0,0,0,0,0,0,[GO:0004851],10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,A0A403SI77,0,A0A403SI77_SALTH,4-hydroxy-3-methylbut-2-enyl diphosphate reduc...,ispH lytB A9W32_19165 D6S79_16695 DSR63_08490 ...,Salmonella thompson,316,GO:0016114; GO:0019288; GO:0046872; GO:0050992...,MQILLANPRGFCAGVDRAISIVENALAIYGAPIYVRHEVVHNRYVV...,0,...,0,0,0,0,0,0,0,0,[GO:0016114],45
994,A0A403SIB8,0,A0A403SIB8_SALTH,DNA protection during starvation protein (EC 1...,dps pexB A9W32_12310 D6S79_19340 DSR63_18085 D...,Salmonella thompson,167,GO:0003677; GO:0005737; GO:0006879; GO:0008199...,MSTAKLVKTKASNLLYTRNDVSESDKKATVELLNRQVIQFIDLSLI...,0,...,0,0,0,0,0,0,0,0,[GO:0003677],2
995,A0A403SIE5,0,A0A403SIE5_SALTH,Probable phosphoglycerate mutase GpmB (EC 5.4....,gpmB A9W32_07200 D6S79_17020 DSR63_08815 DYM67...,Salmonella thompson,215,GO:0004619; GO:0006096; GO:0016020,MLQVYLVRHGETQWNAERRIQGQSDSPLTAKGEQQAMQVGERARSL...,0,...,0,0,0,0,0,0,0,0,[GO:0004619],29
996,A0A403SIE9,0,A0A403SIE9_SALTH,Pyruvate dehydrogenase [ubiquinone] (EC 1.2.5....,poxB A9W32_10525 D6S79_19235 DSR63_17710 DYM67...,Salmonella thompson,572,GO:0000287; GO:0005886; GO:0008289; GO:0030976...,MKQTVAAFIAKTLEQAGVKQIWGVTGDSLNGLSDSLNRMGTIEWMP...,0,...,0,0,0,0,0,0,0,0,[GO:0000287],42


In [16]:
import pandas as pd

column_names = data.columns[9:-1].tolist()  # Get column names from data

# Defining columns to include in X
columns_to_include = ['Length', 'go_labels']
columns_to_include.extend(column_names)


x = data[columns_to_include]
y = data['Virulent']
x.drop(columns=['filtered_GO_terms'], inplace=True)
x




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(columns=['filtered_GO_terms'], inplace=True)


Unnamed: 0,Length,go_labels,AAA,AAC,AAD,AAE,AAF,AAG,AAH,AAI,...,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY
0,729,51,1,0,0,0,1,2,0,0,...,0,0,1,0,0,0,0,0,0,0
1,729,51,1,0,0,0,1,2,0,0,...,0,0,1,0,0,0,0,0,0,0
2,457,10,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,729,51,1,0,0,0,1,2,0,0,...,0,0,1,0,0,0,0,0,0,0
4,457,10,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,316,45,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
994,167,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
995,215,29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,572,42,0,0,2,0,2,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)


In [18]:
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.75      0.74        57
           1       0.00      0.00      0.00        17

    accuracy                           0.58        74
   macro avg       0.36      0.38      0.37        74
weighted avg       0.55      0.58      0.57        74

