# 4-CreateDatasets
This tutorial shows a basic template for creating a dataset computationally.

In [1]:
from pyspark import SparkContext
from pyspark.sql import Row, SparkSession
from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.io import mmtfReader

#### Configure Spark Session and Spark Context

In [2]:
# Build (or reuse) a Spark session that runs locally with 4 worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("4-CreateDatasets")
    .getOrCreate()
)
# Keep a handle on the underlying SparkContext; the MMTF reader takes it as an argument.
sc = spark.sparkContext

## Read a 10% fraction of the sample file
Reading a random fraction of the input file is a good strategy for testing new functionality.

In [3]:
# Location of the sample MMTF data, relative to this notebook.
path = "../resources/mmtf_full_sample"

# Load only a random 10% of the entries so the rest of the notebook runs quickly.
pdb = mmtfReader.read_sequence_file(
    path,
    sc,
    fraction=0.1,
)

# Creating a dataset in 3 simple steps

## Step 1: calculate properties for a structure and add them to a Row object

In [4]:
def calcProperties(s):
    """Summarize one structure entry as a Row of its id and size counts.

    Args:
        s: An indexable pair where s[0] is the PDB id and s[1] is the
            MMTF structure record.

    Returns:
        A positional Row holding, in order: the PDB id and the structure's
        num_models, num_chains, num_groups, num_atoms and num_bonds.
    """
    pdb_id = s[0]
    structure = s[1]
    # Order matters: the values are positional and are given column names later.
    counts = (
        structure.num_models,
        structure.num_chains,
        structure.num_groups,
        structure.num_atoms,
        structure.num_bonds,
    )
    return Row(pdb_id, *counts)

## Step 2: map structures to rows
Here we use a lambda expression to calculate properties.

In [5]:
# Turn every (pdb id, structure) entry of the RDD into one Row of properties.
rows = pdb.map(
    lambda entry: calcProperties(entry)
)

## Step 3: convert RDD of Rows to a dataset

In [6]:
# Column names, listed in the same order as the values placed in each Row.
col_names = [
    "pdbId",
    "models",
    "chains",
    "groups",
    "atoms",
    "bonds",
]
summary = pythonRDDToDataset.get_dataset(rows, col_names)
# Alternative method (converts int to long):
# summary = spark.createDataFrame(rows, col_names)

## Done: Show some details about this dataset

In [7]:
# List the column names of the new dataset.
summary.columns

['pdbId', 'models', 'chains', 'groups', 'atoms', 'bonds']

In [8]:
# Print each column's name, data type and nullability.
summary.printSchema()

root
 |-- pdbId: string (nullable = false)
 |-- models: integer (nullable = false)
 |-- chains: integer (nullable = false)
 |-- groups: integer (nullable = false)
 |-- atoms: integer (nullable = false)
 |-- bonds: integer (nullable = false)



In [9]:
# Preview the dataset as a text table; only the first rows are printed.
summary.show()

+-----+------+------+------+-----+-----+
|pdbId|models|chains|groups|atoms|bonds|
+-----+------+------+------+-----+-----+
| 1LGH|     1|    68|   512| 5436| 5526|
| 1LJ8|     1|     3|   930| 4310| 3965|
| 1LMI|     1|     2|   303| 1139|  989|
| 1LQV|     1|    30|   862| 4048| 3695|
| 1LRI|     1|     4|   199|  861|  777|
| 4XPQ|     1|    11|  1254| 6027| 5613|
| 4XUO|     1|     6|   373| 2459| 2436|
| 4XWJ|     1|     4|   264| 1879| 1879|
| 4XXI|     1|     6|   357| 2451| 2424|
| 6F8P|     1|     2|   637| 2668| 2389|
| 6C8R|     1|     7|  1382| 6524| 6022|
| 4YNX|     1|     6|   129|  706|  667|
| 4YOR|     1|     6|   837| 3955| 3630|
| 4YPC|     1|     2|   192|  850|  746|
| 1GP6|     1|     8|   713| 3186| 2892|
| 1GPP|     1|     2|   520| 2060| 1789|
| 1GWE|     1|     6|  1380| 5144| 4388|
| 1C4O|     1|     8|   846| 4453| 4196|
| 1F1M|     1|    14|  1286| 5534| 4920|
| 1G8M|     1|     7|  1927| 9795| 9219|
+-----+------+------+------+-----+-----+
only showing top 20 rows

#### Print statistics for the numerical columns

In [10]:
# Skip the first column (pdbId) and summarize only the numerical ones,
# then convert the result to pandas for display as a table.
(
    summary
    .describe(col_names[1:])
    .toPandas()
)

Unnamed: 0,summary,models,chains,groups,atoms,bonds
0,count,1003.0,1003.0,1003.0,1003.0,1003.0
1,mean,1.0,8.563310069790628,704.6380857427716,3539.3868394815554,3282.731804586241
2,stddev,0.0,7.25805559832876,434.0678539370103,2116.16935909995,1998.03065188792
3,min,1.0,2.0,43.0,183.0,171.0
4,max,1.0,68.0,2598.0,9956.0,9692.0


In [11]:
# Stop the Spark session now that the notebook is finished.
spark.stop()