In [1]:
-- Start with just one simple import.
import qualified DataFrame as D

-- Read the housing CSV into `df`, the frame the following cells work from.
df <- D.readCsv "./examples/data/housing.csv"



In [2]:
-- Quick overview: per-column non-null count, null count and type.
df |> D.describeColumns

  ------------------------------------------------------------------------------------  
| Column Name<br>Text | # Non-null Values<br>Int | # Null Values<br>Int | Type<br>Text |
| --------------------|--------------------------|----------------------|------------- |
| total_bedrooms      | 20433                    | 207                  | Maybe Double |
| ocean_proximity     | 20640                    | 0                    | Text         |
| median_house_value  | 20640                    | 0                    | Double       |
| median_income       | 20640                    | 0                    | Double       |
| households          | 20640                    | 0                    | Double       |
| population          | 20640                    | 0                    | Double       |
| total_rooms         | 20640                    | 0                    | Double       |
| housing_median_age  | 20640                    | 0                    | Double       |
| latitude            | 20640                    | 0                    | Double       |
| longitude           | 20640                    | 0                    | Double       |


In [3]:
import qualified DataFrame.Functions as F

-- Create auto-complete-ready references that you can use in expressions.
-- Later cells refer to columns such as median_house_value and ocean_proximity
-- as plain identifiers after this call.
F.declareColumns df

In [4]:
import qualified DataFrame.Display.Web.Plot as Plt
import System.Random

-- Label every row as "Expensive" (median_house_value >= 450000) or "NotExpensive",
-- then plot a random sample of 10% of the points. The generator seed is fixed at 42.
df
    |> D.derive
        "is_expensive"
        (F.ifThenElse (median_house_value .>= 450000) "Expensive" "NotExpensive")
    |> D.sample (mkStdGen 42) 0.1
    |> Plt.plotScatterBy "longitude" "latitude" "is_expensive"

In [5]:
-- Count and percentage of rows for each distinct ocean_proximity value.
df |> D.frequencies "ocean_proximity"

  ----------------------------------------------------------------------------------------------------------  
| Statistic<br>Text | <1H OCEAN<br>Any | INLAND<br>Any | ISLAND<br>Any | NEAR BAY<br>Any | NEAR OCEAN<br>Any |
| ------------------|------------------|---------------|---------------|-----------------|------------------ |
| Count             | 9136             | 6551          | 5             | 2290            | 2658              |
| Percentage (%)    | 44.26%           | 31.74%        | 0.02%         | 11.09%          | 12.88%            |


In [6]:
-- Typed columns mean fewer bugs.
-- One constructor per ocean_proximity category, plus UNKNOWN as a catch-all.
data OceanProximity = ISLAND | INLAND | NEAR_BAY | NEAR_OCEAN | LESS_THAN_HOUR | UNKNOWN deriving (Show, Eq, Read, Ord)

-- Raw column value -> typed constructor.
-- Keys must match the text in the CSV exactly (see the D.frequencies output above):
-- the data spells these categories "NEAR BAY" and "NEAR OCEAN" with a space,
-- not an underscore, so underscore keys would never match a row.
mapping =
    [ ("ISLAND", ISLAND)
    , ("INLAND", INLAND)
    , ("NEAR BAY", NEAR_BAY)
    , ("NEAR OCEAN", NEAR_OCEAN)
    , ("<1H OCEAN", LESS_THAN_HOUR)
    ]

-- Overwrite the ocean_proximity column with its OceanProximity encoding,
-- using UNKNOWN as the default passed to recodeWithDefault.
typedDf = df |> D.derive "ocean_proximity" (F.recodeWithDefault UNKNOWN mapping ocean_proximity)

-- The column's type has changed, so declare the column references again from the new frame.
F.declareColumns typedDf

In [7]:
-- Comparing against the INLAND constructor instead of a string: no more typos
-- or spurious comparison bugs. Show the first 10 matching rows.
typedDf
    |> D.filterWhere ((median_house_value .>= 400000) .&& (ocean_proximity .== F.lit INLAND))
    |> D.take 10

  ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------  
| longitude<br>Double | latitude<br>Double | housing_median_age<br>Double | total_rooms<br>Double | total_bedrooms<br>Maybe Double | population<br>Double | households<br>Double | median_income<br>Double | median_house_value<br>Double | ocean_proximity<br>OceanProximity |
| --------------------|--------------------|------------------------------|-----------------------|--------------------------------|----------------------|----------------------|-------------------------|------------------------------|---------------------------------- |
| -121.88             | 37.64              | 20.0                         | 1309.0                | Just 184.0                     | 514.0                | 172.0                | 10.9506                 | 475800.0                     | INLAND                            |
| -121.85             | 37.72              | 43.0                         | 228.0                 | Just 40.0                      | 83.0                 | 42.0                 | 10.3203                 | 400000.0                     | INLAND                            |
| -121.62             | 37.61              | 26.0                         | 1786.0                | Just 306.0                     | 771.0                | 279.0                | 5.7239                  | 430600.0                     | INLAND                            |
| -121.72             | 37.7               | 17.0                         | 1671.0                | Just 352.0                     | 729.0                | 252.0                | 6.1023                  | 450000.0                     | INLAND                            |
| -121.77             | 37.74              | 25.0                         | 494.0                 | Just 81.0                      | 254.0                | 85.0                 | 9.1531                  | 418800.0                     | INLAND                            |
| -121.97             | 37.87              | 4.0                          | 1029.0                | Just 126.0                     | 416.0                | 122.0                | 13.4883                 | 500001.0                     | INLAND                            |
| -121.96             | 37.85              | 10.0                         | 3209.0                | Just 379.0                     | 1199.0               | 392.0                | 12.2478                 | 500001.0                     | INLAND                            |
| -121.94             | 37.83              | 11.0                         | 2836.0                | Just 373.0                     | 959.0                | 335.0                | 10.5815                 | 500001.0                     | INLAND                            |
| -121.89             | 37.82              | 4.0                          | 11444.0               | Just 1355.0                    | 3898.0               | 1257.0               | 13.2949                 | 500001.0                     | INLAND                            |
| -121.91             | 37.81              | 7.0                          | 3477.0                | Just 416.0                     | 1216.0               | 395.0                | 13.1499                 | 500001.0                     | INLAND                            |


In [8]:
-- `.=` lets us name each aggregated column.
import DataFrame.Functions ((.=))

-- Group the raw frame by ocean_proximity, then compute one named aggregate per
-- output column: the summed house value and the mean rooms-per-household ratio.
df
    |> D.groupBy [F.name ocean_proximity]
    |> D.aggregate
        [ "total_house_value" .= F.sum median_house_value
        , "average_rooms_per_household" .= F.mean (total_rooms / households)
        ]

  ---------------------------------------------------------------------------------------------  
| ocean_proximity<br>Text | total_house_value<br>Double | average_rooms_per_household<br>Double |
| ------------------------|-----------------------------|-------------------------------------- |
| NEAR OCEAN              | 6.62995512e8                | 5.206007549366366                     |
| INLAND                  | 8.17600123e8                | 5.977265079384676                     |
| <1H OCEAN               | 2.193410032e9               | 5.1525760933659015                    |
| NEAR BAY                | 5.93596194e8                | 5.22170524136999                      |
| ISLAND                  | 1902200.0                   | 5.65657716330408                      |


## Like what you see?
* See the full example with linear regression (using Hasktorch) in the examples folder.
* Join the DataHaskell Discord.