# Classification of crystal system for batteries (100% acc)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install chemparse
!pip install chemparse

In [None]:
# Import modules
import chemparse

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import xgboost

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

### Importing the data

In [None]:
# Location of the dataset inside the Kaggle input directory
DATA_PATH = "/kaggle/input/crystal-system-properties-for-liion-batteries/lithium-ion batteries.csv"

# Read the whole CSV into a dataframe
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw data
df.head(n=5)

In [None]:
# Number of observations (rows) in the dataset
df.shape[0]

#### We see that the dataset is pretty small, only 339 observations.
#### We look at the string-typed columns and what their unique values look like, to get an idea of where dummy variables are appropriate and where they are not.

In [None]:
# Columns whose unique values we want to inspect before choosing an encoding
str_columns = ("Formula", "Spacegroup", "Crystal System")
separator = "------------------------------------------------------"

for col in str_columns:
    # Show the column name, then its distinct values, then a divider
    print(f"Column: {col}")
    print(f"Values: {df[col].unique()}\n")
    print(separator)

### Chemical formulas
#### We are going to use the chemparse module (https://pypi.org/project/chemparse/) to parse the Formula column. We want each formula to be split into one column per element, where the value is the number of atoms of that element in the formula. For example, for an H2O molecule the column "H" would have the value 2 and the column "O" would have the value 1.

In [None]:
# Parse every formula string with chemparse; each row becomes a dictionary
parsed_formulas = df["Formula"].apply(chemparse.parse_formula)

# Expand the dictionaries into one column per key and
# replace the NaN's of keys missing from a row with zeros
chem_df = pd.json_normalize(parsed_formulas).fillna(0)

# Attach the new columns to the original dataframe (index-aligned join)
df = df.join(chem_df)

# Final result:
df.head()

In [None]:
# The formula is now represented by the new per-key columns,
# so the original string column is removed
df = df.drop("Formula", axis="columns")

### Let's look at spacegroup

In [None]:
# Count how often each space group occurs
spaceg_data = df["Spacegroup"].value_counts()

# Draw the counts as a bar chart on a 12x9 inch figure
plt.figure(figsize=(12, 9))
spaceg_data.plot.bar()
plt.show()

We encode the space groups as integer codes (the dense rank of each label), so that the column becomes numerical

In [None]:
# Replace each space group label by its dense rank (1, 2, 3, ... with no gaps),
# stored as an integer column
spacegroup_rank = df["Spacegroup"].rank(method="dense")
df["Spacegroup"] = spacegroup_rank.astype(int)

### Looking at bandstructure

In [None]:
# Bar chart of how many rows have / do not have a band structure
bandstructure_counts = df["Has Bandstructure"].value_counts()
bandstructure_counts.plot.bar()

Mapping the values to 1 for true, 0 for false

In [None]:

# Boolean flag -> integer flag (values not found in the mapping become NaN)
bool_to_int = {True: 1, False: 0}
df["Has Bandstructure"] = df["Has Bandstructure"].map(bool_to_int)

### Final processing: encode Crystal System as integer class labels and drop the ID column

In [None]:
# Remove the identifier column
df = df.drop("Materials Id", axis="columns")

# Encode the target label by its dense rank (1, 2, 3, ...) as an integer
crystal_system_rank = df["Crystal System"].rank(method="dense")
df["Crystal System"] = crystal_system_rank.astype(int)

### Let's also look at a heatmap of all features now that they all are numerical

In [None]:
# Pairwise correlations between all columns of the processed dataframe
correlations = df.corr()

# Draw the annotated heatmap on an explicitly created 12x9 inch axes
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(correlations, annot=True, ax=ax)

### Classification time
#### Let's split up the dataframe into feature data and labels and look at how the label is distributed

In [None]:
# Separate the label column from the feature columns
target_column = "Crystal System"
X = df.drop(columns=[target_column])
y = df[target_column]
# NOTE(review): "Spacegroup" stays in X. In crystallography a space group
# belongs to exactly one crystal system, so this feature may leak the label -
# confirm whether it should be kept as a feature.

In [None]:
# Share of each Crystal System class, expressed in percent of all rows
yvals = y.value_counts().div(len(y)).mul(100)

# Bar chart of the class shares with labelled axes
yvals.plot.bar()
plt.xlabel("Class")
plt.ylabel("% of dataset")
plt.show()

#### We can see from this plot that a baseline model that only predicts "1" would score about 40% accuracy. To get a meaningful result, we need to score better than this.

### Classifiers
#### We are going to use a Decision Tree Classifier as well as an Xgboost classifier and compare their results.

In [None]:
# 80% training data and 20% testing.
# A fixed random_state makes the split - and therefore the scores reported
# below - reproducible when the notebook is re-run from a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Decision tree
# Seeded so that the fitted tree, and hence the score, is the same on every run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Accuracy on the held-out test data
clf.score(X_test, y_test)

In [None]:
# Xgboost classifier
# XGBClassifier expects the class labels to be 0..n_classes-1 and recent
# releases raise a ValueError otherwise. Subtracting the smallest label of the
# full label vector makes the labels zero-based without changing the classes.
label_offset = y.min()

# Seeded so that the result is the same on every run
clf = xgboost.XGBClassifier(verbosity=0, random_state=42)
clf.fit(X_train, y_train - label_offset)

# Accuracy on the held-out test data (labels shifted the same way as for training)
clf.score(X_test, y_test - label_offset)

### Look at that!
#### Both models scored 100% on the testing data. However, the dataset is pretty small, only a bit over 300 observations. Let's run a K-fold cross-validation with both Xgboost and the decision tree and see how the results vary across different splits of the data.

In [None]:
# Let's do 10 splits of the data.
# The folds are shuffled with a fixed seed so that they do not depend on the
# row order of the dataset and the scores are reproducible on a re-run.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Both classifiers are seeded for the same reason
clf_xg = xgboost.XGBClassifier(verbosity=0, random_state=42)
clf_dt = DecisionTreeClassifier(random_state=42)

# Convert the data to np-arrays to enable indexing in the loop
X_data = np.array(X)
y_data = np.array(y)

# XGBClassifier expects the class labels to be 0..n_classes-1 and recent
# releases raise a ValueError otherwise, so xgboost gets zero-based labels.
y_data_xg = y_data - y_data.min()

DT_scores = []
XG_scores = []

# n is the 1-based fold number used in the printed report
for n, (train, val) in enumerate(kfold.split(X_data, y_data), start=1):

    # Xgboost
    clf_xg.fit(X_data[train], y_data_xg[train])
    acc = clf_xg.score(X_data[val], y_data_xg[val])
    XG_scores.append(acc)
    print(f"XG: Fold {n} ACC: {acc}")

    # Decision tree
    clf_dt.fit(X_data[train], y_data[train])
    acc = clf_dt.score(X_data[val], y_data[val])
    DT_scores.append(acc)
    print(f"Dtree: Fold {n} ACC: {round(acc, 2)}")

    print("---------------------")

# Summarise the per-fold accuracies (best, worst, mean, sample standard deviation)
score_summary = pd.DataFrame(
    {"Decision tree classifier": DT_scores, "Xgboost": XG_scores}
).agg(["max", "min", "mean", "std"]).T
score_summary.columns = ["Best", "Worst", "Mean", "Std"]
score_summary
    


#### While xgboost performed consistently at 100%, the decision tree classifier has a larger variance in its results, varying from 85% to 100% depending on the fold of the data.

|Classifier|Best|Worst|Mean|Std|
|----------|-----|-----|-------|-----|
|Decision tree classifier| 100% | 85% | 96% | 0.05 |
|Xgboost | 100% | 100% | 100% | 0 |


#### In conclusion, the xgboost classifier outperforms the Decision Tree Classifier with a constant 100% score. Please note, however, that the dataset is very small, so overfitting cannot be completely ruled out. Note also that a crystal's space group determines its crystal system, so the Spacegroup feature may by itself explain the perfect scores. It would be very interesting to test the model on a larger set of unseen data.