# Machine learning for improving the precision of CRISPR-Cas9

In [1]:
import pandas as pd

### Loading database

In [2]:
# Load the dataset from the CSV file (path is relative to the working
# directory) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Data.csv")
df.head(n=5)

Unnamed: 0,23-nt sequence,30-nt sequence,modFreq
0,GGCTGCTTTACCCGCTGTGGGGG,TGGAGGCTGCTTTACCCGCTGTGGGGGCGC,0.874864
1,TCCGGGTTGGCCTTCCACTGGGG,CGTCTCCGGGTTGGCCTTCCACTGGGGCAG,0.871393
2,CAGCATCCTTCGGAAAGCTCTGG,CCCTCAGCATCCTTCGGAAAGCTCTGGACA,0.860592
3,CGGTAGAAGCAGGTAGTCTGGGG,GGCGCGGTAGAAGCAGGTAGTCTGGGGATA,0.802276
4,CATCCCGCTGCCCCAGTGGAAGG,GTGTCATCCCGCTGCCCCAGTGGAAGGCCA,0.796638


### Dropping column "30-nt sequence" as we won't be using it

In [3]:
# Remove the unused "30-nt sequence" column. `df` is reassigned here, so
# without `errors="ignore"` a second run of this cell would raise KeyError
# because the column is already gone; with it the cell is idempotent.
df = df.drop(columns=["30-nt sequence"], errors="ignore")
df.head()

Unnamed: 0,23-nt sequence,modFreq
0,GGCTGCTTTACCCGCTGTGGGGG,0.874864
1,TCCGGGTTGGCCTTCCACTGGGG,0.871393
2,CAGCATCCTTCGGAAAGCTCTGG,0.860592
3,CGGTAGAAGCAGGTAGTCTGGGG,0.802276
4,CATCCCGCTGCCCCAGTGGAAGG,0.796638


In [4]:
# Pull the 23-nt sequences out of the frame as a plain Python list.
lista = df["23-nt sequence"].tolist()
lista[:5]

['GGCTGCTTTACCCGCTGTGGGGG',
 'TCCGGGTTGGCCTTCCACTGGGG',
 'CAGCATCCTTCGGAAAGCTCTGG',
 'CGGTAGAAGCAGGTAGTCTGGGG',
 'CATCCCGCTGCCCCAGTGGAAGG']

### Now we build lists with the features that affect the efficiency of the gRNA.

In [5]:
# Nucleotide at position 20 (1-based) of every sequence, i.e. string index 19.
num_20 = [seq[19] for seq in lista]

num_20[:5]

['G', 'G', 'C', 'G', 'A']

### We construct a list in which nucleotide G at position 20 is encoded as 1 and nucleotides A, C, or T are encoded as 0

In [6]:
# Binary flag per sequence: 1 when the nucleotide at position 20 is "G", else 0.
num_20_bool = [1 if nt == "G" else 0 for nt in num_20]

num_20_bool[:5]

[1, 1, 0, 1, 0]

In [7]:
# Nucleotide at position 21 (1-based) of every sequence, i.e. string index 20.
num_21 = [seq[20] for seq in lista]

num_21[:5]

['G', 'G', 'T', 'G', 'A']

### We construct a list in which nucleotide C at position 21 is encoded as 1 and nucleotides A, G, and T are encoded as 0.

In [8]:
# Binary flag per sequence: 1 when the nucleotide at position 21 is "C", else 0.
num_21_bool = [int(nt == "C") for nt in num_21]

num_21_bool[:5]

[0, 0, 0, 0, 0]

In [9]:
# Number of "A" nucleotides in each sequence.
num_a = [seq.count("A") for seq in lista]

num_a[:5]

[1, 1, 5, 5, 4]

In [10]:
# Spot check: recount the "A"s of the third sequence character by character.
sum(nt == "A" for nt in lista[2])

5

### We calculate the percentage of G + C nucleotides in the target DNAs. This is known as the GC content.

In [11]:
# Fraction of G + C nucleotides in each sequence (the GC content), expressed
# as a value between 0 and 1. The denominator is the sequence's own length
# rather than a hard-coded 23, so the fraction stays correct if a sequence of
# a different length appears; for 23-nt sequences the values are unchanged.
por_g_mas_c = [(seq.count("G") + seq.count("C")) / len(seq) for seq in lista]

por_g_mas_c[:5]

[0.6956521739130435,
 0.6956521739130435,
 0.5652173913043478,
 0.6086956521739131,
 0.6956521739130435]

### A GC content between 40% and 60% is favorable, while a GC content above 80% or below 35% is inefficient.

In [12]:
# Binary flag per sequence: 1 when the GC fraction lies strictly between
# 0.40 and 0.60, otherwise 0.
con_gc = [int(0.40 < frac < 0.60) for frac in por_g_mas_c]

con_gc[:6]

[0, 0, 1, 0, 0, 0]

In [13]:
# Nucleotide at position 18 (1-based) of every sequence, i.e. string index 17.
num_18 = [seq[17] for seq in lista]

num_18[:5]


['T', 'C', 'C', 'C', 'G']

### We construct a list in which nucleotide C at position 18 is encoded as 1 and nucleotides A, G, and T are encoded as 0

In [14]:
# Binary flag per sequence: 1 when the nucleotide at position 18 is "C", else 0.
num_18_bool = [1 if nt == "C" else 0 for nt in num_18]

num_18_bool[:5]

[0, 1, 1, 1, 0]

In [15]:
# Nucleotide at position 16 (1-based) of every sequence, i.e. string index 15.
num_16 = [seq[15] for seq in lista]

num_16[:5]

['T', 'C', 'A', 'G', 'G']

In [16]:
# Binary flag per sequence: 1 when the nucleotide at position 16 is "C", else 0.
num_16_bool = [int(nt == "C") for nt in num_16]

num_16_bool[:5]

[0, 1, 0, 0, 0]

In [17]:
# Gather the sequences, the raw nucleotides, their binary encodings, the other
# engineered features and the "modFreq" column into one frame, one row per
# sequence. The dict's insertion order determines the column order.
columnas_base = {
    "Lista": lista,
    "Nucleótido en 20": num_20,
    "G_20": num_20_bool,
    "Nucleótido en 21": num_21,
    "C_21": num_21_bool,
    "Número de A": num_a,
    "Porcentaje de G y C": por_g_mas_c,
    "Contenido de GC": con_gc,
    "Nucleótido en 18": num_18,
    "C_18": num_18_bool,
    "Nucleótido en 16": num_16,
    "C_16": num_16_bool,
    "modFreq": df["modFreq"],
}
base = pd.DataFrame(columnas_base)

base.head()

Unnamed: 0,Lista,Nucleótido en 20,G_20,Nucleótido en 21,C_21,Número de A,Porcentaje de G y C,Contenido de GC,Nucleótido en 18,C_18,Nucleótido en 16,C_16,modFreq
0,GGCTGCTTTACCCGCTGTGGGGG,G,1,G,0,1,0.695652,0,T,0,T,0,0.874864
1,TCCGGGTTGGCCTTCCACTGGGG,G,1,G,0,1,0.695652,0,C,1,C,1,0.871393
2,CAGCATCCTTCGGAAAGCTCTGG,C,0,T,0,5,0.565217,1,C,1,A,0,0.860592
3,CGGTAGAAGCAGGTAGTCTGGGG,G,1,G,0,5,0.608696,0,C,1,G,0,0.802276
4,CATCCCGCTGCCCCAGTGGAAGG,A,0,A,0,4,0.695652,0,G,0,G,0,0.796638


### We will not be using the columns "Nucleótido en 20", "Nucleótido en 21", "Nucleótido en 18" and "Nucleótido en 16" (the raw nucleotide at each position), so we drop them

In [18]:
# Drop the four raw-nucleotide columns in a single call instead of four
# chained reassignments; their binary encodings (G_20, C_21, C_18, C_16) stay.
# `errors="ignore"` makes the cell safe to re-run: `base` is reassigned here,
# so on a second run the columns are already gone and a plain drop would
# raise KeyError.
base = base.drop(
    columns=[
        "Nucleótido en 20",
        "Nucleótido en 21",
        "Nucleótido en 18",
        "Nucleótido en 16",
    ],
    errors="ignore",
)

base.head()

Unnamed: 0,Lista,G_20,C_21,Número de A,Porcentaje de G y C,Contenido de GC,C_18,C_16,modFreq
0,GGCTGCTTTACCCGCTGTGGGGG,1,0,1,0.695652,0,0,0,0.874864
1,TCCGGGTTGGCCTTCCACTGGGG,1,0,1,0.695652,0,1,1,0.871393
2,CAGCATCCTTCGGAAAGCTCTGG,0,0,5,0.565217,1,1,0,0.860592
3,CGGTAGAAGCAGGTAGTCTGGGG,1,0,5,0.608696,0,1,0,0.802276
4,CATCCCGCTGCCCCAGTGGAAGG,0,0,4,0.695652,0,0,0,0.796638


In [19]:
# Feature matrix: the engineered columns given to the models. The raw GC
# fraction ("Porcentaje de G y C") is not included; only its binary flag is.
X = base.loc[:, ["G_20", "C_21", "Número de A", "Contenido de GC", "C_18", "C_16"]]
# Target vector: the "modFreq" column.
y = base.loc[:, "modFreq"]

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Split into train and test sets (scikit-learn's default holds out 25% for
# testing). The split is shuffled randomly, so a fixed random_state is needed
# for the scores reported below to be reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression

In [None]:
# Plain linear-regression baseline, fitted on the training split only.
modelo = LinearRegression()
modelo.fit(X=X_train, y=y_train)

In [24]:
# Score of the linear model on the held-out split (for scikit-learn
# regressors `.score` is the coefficient of determination, R²).
modelo.score(X=X_test, y=y_test)

0.03960358524315999

### Ridge regularization

In [25]:
from sklearn.linear_model import Ridge

In [26]:
# Ridge regression with regularization strength alpha = 130.0, fitted on the
# training split.
# NOTE(review): the alpha value is a hard-coded magic number — confirm how it
# was chosen.
modelo1 = Ridge(alpha=130.0)
modelo1.fit(X=X_train, y=y_train)

In [27]:
# Score of the ridge model on the held-out split.
modelo1.score(X=X_test, y=y_test)

0.03559779601115842

### Lasso Regularization

In [28]:
from sklearn.linear_model import Lasso

In [29]:
# Lasso regression with regularization strength alpha = 10.0, fitted on the
# training split.
# NOTE(review): alpha = 10.0 looks large next to the displayed modFreq values
# (all below 1); the near-zero test score reported for this model suggests the
# coefficients may all be shrunk to zero — confirm and consider tuning alpha.
modelo2 = Lasso(alpha=10.0)
modelo2.fit(X=X_train, y=y_train)

In [30]:
# Score of the lasso model on the held-out split.
modelo2.score(X=X_test, y=y_test)

-0.004562944633035748