In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![SOD1 aggregation in ALS](https://www.stressmarq.com/wp-content/uploads/SOD1-Aggregation-in-ALS-1024x512.png)

Image source: https://www.stressmarq.com/sod1-aggregation-in-als/?v=3e8d115eb4b3

<h1 style="background-color:#DC143C; font-family:'Brush Script MT',cursive;color:white;font-size:200%; text-align:center;border-radius: 50% 20% / 10% 40%">SOD1 (Superoxide Dismutase 1)</h1>

"SOD1 (Superoxide Dismutase 1) is a Protein Coding gene. Diseases associated with SOD1 include Amyotrophic Lateral Sclerosis (ALS) and Spastic Tetraplegia And Axial Hypotonia, Progressive." 

"Among its related pathways are Association Between Physico-Chemical Features and Toxicity Associated Pathways and Response to elevated platelet cytosolic Ca2+. Gene Ontology (GO) annotations related to this gene include protein homodimerization activity and enzyme binding. An important paralog of this gene is CCS."

https://www.genecards.org/cgi-bin/carddisp.pl?gene=SOD1

# **<span style="color:#DC143C;">Data from Mutant SOD1</span>**

"Expression profiling of spinal cord from SOD1(G93A) mice and age matched controls at ages 28, 42, 56, 70, 98, 112, and 126 days of age. The authors used microarrays to determine differential gene expression throughout disease progression in the spinal cord of mutant SOD1(G93A) model of ALS."

"Samples were collected from male B6SJL SOD1(G93A) and age matched controls. 3 samples were collected representing each genotype and age group for RNA extraction and hybridization on Affymetrix microarrays."

http://biogps.org/#goto=genereport&id=12566&show_dataset=E-GEOD-18597

In [None]:
# Upper bound on the number of rows parsed from the CSV; set to None to read the whole file.
nRowsRead = 1000
# The file is semicolon-separated and decoded as ISO-8859-2.
df = pd.read_csv(
    '../input/cusersmarildownloadsmutantcsv/mutant.csv',
    sep=';',
    encoding="ISO-8859-2",
    nrows=nRowsRead,
)
# Keep the source file name on the frame object itself.
df.dataframeName = 'mutant.csv'
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')
# Last expression of the cell: the first rows, rendered with black text on a red background.
df.head().style.set_properties(**{'background-color': 'red', 'color': 'black'})

Many forms of ALS, regardless of the offending primary gene mutation, show TDP-43 pathology; exceptions are cases associated with SOD1 and FUS gene mutations.

https://www.kaggle.com/mpwolke/als-dna-genetics (ALS DNA & Genetics).

In [None]:
# Number of missing values in each column of df (shown as the cell output).
df.isna().sum()

**<span style="color:#DC143C;">Protein attributes for SOD1 Gene</span>**

Size: 154 amino acids

Molecular mass: 15936 Da

Cofactor: Name=Cu cation; Xref=ChEBI:CHEBI:23378;

Cofactor: Name=Zn(2+); Xref=ChEBI:CHEBI:29105;

Quaternary structure: Homodimer; non-disulfide linked. Homodimerization may take place via the ditryptophan cross-link at Trp-33. The pathogenic variants ALS1 Arg-38, Arg-47, Arg-86 and Ala-94 interact with RNF19A, whereas wild-type protein does not. The pathogenic variants ALS1 Arg-86 and Ala-94 interact with MARCH5, whereas wild-type protein does not.

Miscellaneous: The protein (both wild-type and ALS1 variants) has a tendency to form fibrillar aggregates in the absence of the intramolecular disulfide bond or of bound zinc ions. These aggregates may have cytotoxic effects. Zinc binding promotes dimerization and stabilizes the native form.

https://www.genecards.org/cgi-bin/carddisp.pl?gene=SOD1

In [None]:
# Donut plot: one slice per column, sized by the number of non-null values in that column.
feature_names = "Samples", "1416873_a_at", "1447617_at"
# Size each slice from the data held in df. The previous version used the length of the
# column-name strings (7, 12 and 10 characters), which says nothing about the data set.
feature_size = [df[name].count() for name in feature_names]
# create a circle for the center of plot
circle = plt.Circle((0, 0), 0.2, color="white")
plt.pie(feature_size, labels=feature_names, colors=["red", "green", "blue"])
p = plt.gcf()
# Drawing the white circle over the middle of the pie turns it into a donut.
p.gca().add_artist(circle)
plt.title("Number of Each Feature")
plt.show()

# **<span style="color:#DC143C;">SOD1 Aggregation in ALS</span>**

Author: Patricia Thomson - JUNE 6TH, 2019 

**<span style="color:#DC143C;">Linking SOD1 and ALS</span>**

"Approximately 20% of fALS (familial ALS) cases are associated with SOD1 mutations. Over 150 mutations have been implicated. Most are point mutations; it is unclear whether all of these are sufficient to cause ALS as single mutations or if several could be needed in conjunction. SOD1 mutations can also occur in sALS, and have been associated with variations in fALS survival times."

"It has been suggested that mutations cause SOD1 to lose its ability to scavenge superoxide radicals, leading to increased oxidative stress, but it now seems more likely that SOD1 contributes to ALS by gaining a toxic function rather than losing its regular function. This toxic gain-of function is thought to be SOD1 misfolding and aggregation into oligomers and ultimately larger aggregates. This idea is supported by the presence of mutant SOD1–containing aggregates that increase in abundance as disease progresses."

https://www.stressmarq.com/sod1-aggregation-in-als/?v=3e8d115eb4b3

In [None]:
# Bar chart: the "Samples" column on the x axis against column 1416873_a_at on the y axis.
fig = px.bar(
    df,
    x="Samples",
    y="1416873_a_at",
    title='Mutant SOD1 Samples',
    color_discrete_sequence=['crimson'],
)
fig.show()

# **<span style="color:#DC143C;">How SOD1 Mutations Lead to Aggregation</span>**

SOD1 usually undergoes the following post-translational modifications:

Copper Insertion, Zinc Insertion, Dimerization, Disulfide Bond Formation.

"Many fALS-associated mutations disrupt these post-translational modifications, preventing the proper structure and folding of the SOD1 protein. Misfolded SOD1 is prone to aggregation into soluble oligomers, via the formation of intermolecular disulfide bonds between the free cysteine residues of different SOD1 molecules and non-covalent interactions (hydrogen bonds) between beta strands of SOD1 subunits."

**<span style="color:#DC143C;">Demetallation</span>**

"Apo, or completely demetallated SOD1 is susceptible to oligomerization, as is zinc-deficient SOD1. FALS-associated mutations are thought to lower the zinc binding affinity of SOD1 by altering the zinc binding geometry. One possibility is that mutations that perturb the electrostatic loop allow solvent to access the metal sites, preventing metallation with Cu and Zn."

**<span style="color:#DC143C;">Dimerization</span>**

"Dimerization of SOD1 reduces the surface area that is accessible to solvent, which increases its stability. When dimerization is disrupted, unstable monomers are prone to aggregation. fALS-associated mutations occur on the dimer interphase and can cause dimers to dissociate into monomers which act as aggregation templates."

**<span style="color:#DC143C;">Disulfide Bonding</span>**

"Each SOD1 subunit contains a disulfide bond between two of cysteine residues (Cys 57 and Cys 146). Intrasubunit disulfide bonding increases SOD1 stability, so its interruption can result in aggregation of unstable species. Intrasubunit disulfide bond reduction is necessary for fibril initiation and promotes faster seeding of aggregates."

https://www.stressmarq.com/sod1-aggregation-in-als/?v=3e8d115eb4b3

In [None]:
# Bar chart with the axes the other way round: column 1447617_at on x, "Samples" on y.
fig = px.bar(
    df,
    x="1447617_at",
    y="Samples",
    title='Mutant SOD1 Samples',
    color_discrete_sequence=['#2B3A67'],
)
fig.show()

# **<span style="color:#DC143C;">Mechanisms of SOD1 Toxicity</span>**

"As in other neurodegenerative diseases, the oligomeric form of SOD1 is likely more toxic than large SOD1 aggregates. Some hypothesize that aggregation is a protective mechanism against these toxic oligomers. These oligomers are thought to contain antiparallel, out-of-register β-sheet structures involving segment."

There are several proposed mechanisms of toxicity for SOD1 oligomers:

**<span style="color:#DC143C;">Excitotoxicity</span>**

"An increase in the extracellular glutamate concentration causes an influx of calcium into the postsynaptic neuron, which can result in mitochondrial damage and ultimately apoptosis. Riluzole, a drug that extends survival for ALS patients by 2-3 months, is thought to negate excitotoxicity."

**<span style="color:#DC143C;">Endoplasmic Reticulum Stress</span>**

"Mutant SOD1 interacts with ER-associated degradation machinery (ERAD) and causes it to malfunction.  This can also lead to apoptosis."

**<span style="color:#DC143C;">Axonal Transport Disruption</span>**

"Mutant SOD1 can induce axonal transport defects by reducing microtubule stability and disrupting mitochondrial transport."

**<span style="color:#DC143C;">Oxidative Stress and Mitochondrial Damage</span>**

"Oxidative stress is associated with a variety of disease states, and oxidative stress is thought to contribute to the pathogenesis of sporadic ALS. The responsible ROS may be a product of mitochondrial dysfunction due to the accumulation of aggregated SOD1. The release of mitochondrial Ca2+ can also contribute to apoptosis."

**<span style="color:#DC143C;">Non-Cell Autonomous Toxicity</span>**

"ALS is considered to be non-cell autonomous, meaning that other non-neuronal cells, such as astrocytes and microglia also contribute to pathogenesis and disease progression."

https://www.stressmarq.com/sod1-aggregation-in-als/?v=3e8d115eb4b3

In [None]:
# 3D scatter plot of the two probe columns (x, y) against the "Samples" column (z).
fig = px.scatter_3d(
    df,
    x='1416873_a_at',
    y='1447617_at',
    z='Samples',
)
fig.show()

#Not much "scatter" in this 3D plot. What is weird is that the probe numbers (e.g. 1447617_at) are the same as those in the "Cytokine inhibition, production and autoimmunity" Dataset and cytokine file.

In [None]:
#Code by Salman Ibne Eunus  https://www.kaggle.com/salmaneunus/linear-models-and-regularization-for-regresssion

!pip install mglearn==0.1.9

In [None]:
%matplotlib inline
import sys
from scipy import sparse
print("Python version: {}".format(sys.version))
print("pandas version: {}".format(pd.__version__))
import matplotlib
print("matplotlib version: {}".format(matplotlib.__version__))
print("NumPy version: {}".format(np.__version__))
import scipy as sp
print("SciPy version: {}".format(sp.__version__))
import IPython
print("IPython version: {}".format(IPython.__version__))
import sklearn
print("scikit-learn version: {}".format(sklearn.__version__))
import mglearn
import matplotlib.pyplot as plt

In [None]:
# Draw mglearn's built-in linear-regression illustration. The call takes no arguments,
# so nothing from `df` is passed in.
# NOTE(review): presumably this plots mglearn's synthetic "wave" example — confirm.
mglearn.plots.plot_linear_regression_wave()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# X and y come from mglearn's make_wave generator (60 samples requested), not from `df`,
# so the models fitted on these splits never see the mutant.csv data.
X, y = mglearn.datasets.make_wave(n_samples=60)
# Split into training and test parts; random_state=42 fixes the shuffle so reruns give the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Encode df with pandas.get_dummies and rebind the result to the same name.
# NOTE(review): this overwrites the frame that the earlier plotting cells read, so re-running
# this cell (or those cells afterwards) operates on the already-encoded frame. No modelling
# cell visible in this notebook reads `df` after this point — confirm this step is needed.
df = pd.get_dummies(df)

In [None]:
# Fit a linear regression on the training split; `lr` is reused by the cells below
# for printing coefficients, scoring and plotting.
lr = LinearRegression().fit(X_train, y_train)

In [None]:
# Code by Salman Ibne Eunus  https://www.kaggle.com/salmaneunus/linear-models-and-regularization-for-regresssion

# Show the fitted coefficient array and the intercept of `lr`, one per line.
print(
    "lr.coef_: {}".format(lr.coef_),
    "lr.intercept_: {}".format(lr.intercept_),
    sep="\n",
)

The “slope” parameters (w), also called weights or coefficients, are stored in the `coef_` attribute, while the offset or intercept (b) is stored in the `intercept_` attribute.

In [None]:
# Code by Salman Ibne Eunus  https://www.kaggle.com/salmaneunus/linear-models-and-regularization-for-regresssion

# Score of the linear model on the training split and on the test split, two decimals each.
print(
    "Training set score: {:.2f}".format(lr.score(X_train, y_train)),
    "Test set score: {:.2f}".format(lr.score(X_test, y_test)),
    sep="\n",
)

"The test score of around 0.66 is not very impressive, but we can see training and test score are very close to each other. This implies that we are likely underfitting and not overfitting. Although for this one dimensional dataset there is a little danger of overfitting as the model is very simple. Moreover, with higher dimensional datasets which has large number of features, linear models become quite powerful and thus more chance of overfitting."

My numbers are exactly the same as Salman's numbers, which come from another dataset???

# Ridge Regression

In [None]:
# Code by Salman Ibne Eunus  https://www.kaggle.com/salmaneunus/linear-models-and-regularization-for-regresssion

from sklearn.linear_model import Ridge

# Ridge regression with its default settings, fitted on the training split.
ridge = Ridge().fit(X_train, y_train)
print(
    "Training set score: {:.2f}".format(ridge.score(X_train, y_train)),
    "Test set score: {:.2f}".format(ridge.score(X_test, y_test)),
    sep="\n",
)

In [None]:
# Code by Salman Ibne Eunus  https://www.kaggle.com/salmaneunus/linear-models-and-regularization-for-regresssion

# Ridge regression with alpha=10, fitted on the same training split.
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print(
    "Training set score: {:.2f}".format(ridge10.score(X_train, y_train)),
    "Test set score: {:.2f}".format(ridge10.score(X_test, y_test)),
    sep="\n",
)

In [None]:
# Ridge regression with alpha=0.1; the original note expects this setting to score better.
ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print(
    "Training set score: {:.2f}".format(ridge01.score(X_train, y_train)),
    "Test set score: {:.2f}".format(ridge01.score(X_test, y_test)),
    sep="\n",
)

In [None]:
# Compare the coefficients of the three Ridge models with those of plain LinearRegression.
# Each entry is (fitted model, marker style, legend label); plotting order is unchanged.
coefficient_series = [
    (ridge, 's', "Ridge alpha=1"),
    (ridge10, '^', "Ridge alpha=10"),
    (ridge01, 'v', "Ridge alpha=0.1"),
    (lr, 'o', "LinearRegression"),
]
for model, marker, label in coefficient_series:
    plt.plot(model.coef_, marker, label=label)
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
# Horizontal reference line at zero spanning every coefficient index.
plt.hlines(0, 0, len(lr.coef_))
plt.ylim(-25, 25)
plt.legend()
# Render the figure explicitly, as the donut-plot cell does, so the cell does not
# finish by echoing the repr of the Legend object.
plt.show()

In [None]:
# Draw mglearn's built-in ridge illustration. The call takes no arguments, so none of the
# data frames or models fitted in this notebook are passed in.
# NOTE(review): presumably this plots scores against training-set size — confirm.
mglearn.plots.plot_ridge_n_samples()

In [None]:
# Code by Salman Ibne Eunus  https://www.kaggle.com/salmaneunus/linear-models-and-regularization-for-regresssion

from sklearn.linear_model import Lasso

# Lasso with its default settings; also report how many coefficients are non-zero.
lasso = Lasso().fit(X_train, y_train)
print(
    "Training set score: {:.2f}".format(lasso.score(X_train, y_train)),
    "Test set score: {:.2f}".format(lasso.score(X_test, y_test)),
    "Number of features used: {}".format(np.sum(lasso.coef_ != 0)),
    sep="\n",
)

In [None]:
# Lasso with alpha=0.01. max_iter is raised to 100000 because, per the original note,
# the default would make the model warn that max_iter should be increased.
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print(
    "Training set score: {:.2f}".format(lasso001.score(X_train, y_train)),
    "Test set score: {:.2f}".format(lasso001.score(X_test, y_test)),
    "Number of features used: {}".format(np.sum(lasso001.coef_ != 0)),
    sep="\n",
)

In [None]:
# Code by Salman Ibne Eunus  https://www.kaggle.com/salmaneunus/linear-models-and-regularization-for-regresssion

# Lasso with alpha=0.0001 and the same raised iteration limit as the alpha=0.01 model.
lasso00001 = Lasso(alpha=0.0001, max_iter=100000).fit(X_train, y_train)
print(
    "Training set score: {:.2f}".format(lasso00001.score(X_train, y_train)),
    "Test set score: {:.2f}".format(lasso00001.score(X_test, y_test)),
    "Number of features used: {}".format(np.sum(lasso00001.coef_ != 0)),
    sep="\n",
)

In [None]:
# Code by Salman Ibne Eunus  https://www.kaggle.com/salmaneunus/linear-models-and-regularization-for-regresssion

# Compare the coefficients of the three Lasso models with those of Ridge(alpha=0.1).
# Each entry is (fitted model, marker style, legend label); plotting order is unchanged.
coefficient_series = [
    (lasso, 's', "Lasso alpha=1"),
    (lasso001, '^', "Lasso alpha=0.01"),
    (lasso00001, 'v', "Lasso alpha=0.0001"),
    (ridge01, 'o', "Ridge alpha=0.1"),
]
for model, marker, label in coefficient_series:
    plt.plot(model.coef_, marker, label=label)
# Two-column legend placed just above the axes.
plt.legend(ncol=2, loc=(0, 1.05))
plt.ylim(-25, 25)
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
# Render the figure explicitly, as the donut-plot cell does, so the cell does not
# finish by echoing the repr of the y-label Text object.
plt.show()

#All the code above is by Salman Ibne Eunus  https://www.kaggle.com/salmaneunus/linear-models-and-regularization-for-regresssion

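The results are IDENTICAL to those from Salman's Boston Housing Dataset, even though I'm working with the Mutant SOD1 gene data. This is because the models above are fitted on `X` and `y` generated by `mglearn.datasets.make_wave`, not on the `df` loaded from mutant.csv. Therefore, the regression results above are wrong for this Dataset; only the parts that use the Mutant SOD1 file are valid.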