## Collecting Data

In [1]:
import os

# List the current working directory to see which notebook files, scripts and
# dataset folders are available before collecting data.
os.listdir()

['.git',
 '.ipynb_checkpoints',
 'Collecting Data.ipynb',
 'Hello.ipynb',
 'README.md',
 'collect_words_dataset.sh',
 'datasets',
 'launch_notes.bat']

In [2]:
# Print the dataset download script so its contents are recorded in the notebook.
with open("collect_words_dataset.sh", "r") as f:
    print(f.read())


mkdir -p datasets/words && \
wget http://www.cs.jhu.edu/~mdredze/datasets/sentiment/processed_acl.tar.gz -O datasets/words-temp.tar.gz && \
tar xzvf datasets/words-temp.tar.gz -C datasets/words && \
rm datasets/words-temp.tar.gz



In [1]:
import "fmt"
import "io/ioutil"

// kitchenReviews is the directory holding the kitchen review files
// (presumably the tree extracted by collect_words_dataset.sh — confirm).
const kitchenReviews = "./datasets/words/processed_acl/kitchen"

// Read both review files fully into memory as raw bytes.
positives, err := ioutil.ReadFile(kitchenReviews + "/positive.review")
negatives, err2 := ioutil.ReadFile(kitchenReviews + "/negative.review")

// Failures are only reported here; nothing stops later cells from running
// on whatever was (or was not) read.
if err != nil || err2 != nil {
 fmt.Println("Error(s)", err, err2)
}


In [2]:
import "reflect"

fmt.Println(reflect.TypeOf(positives))

[]uint8


8 <nil>

In [6]:
fmt.Println(string(positives)[:100])

them_it:1 hovering:1 and_occasional:1 cousin_the:2 fictional_baudelaire:1 their_struggles:1 unfortun


101 <nil>

In [7]:
// Pair is one parsed "phrase:count" token from a review file.
type Pair struct {
  // Phrase is the text before the colon, e.g. "them_it".
  Phrase string
  // Frequency is the integer count after the colon.
  Frequency int
}

In [11]:
import "strings"

// Split each file on whitespace so every element is a single "phrase:count"
// token (see the sample printed a few cells below).
pairsPositive := strings.Fields(string(positives))
pairsNegative := strings.Fields(string(negatives))

// Confirm the result type of strings.Fields.
fmt.Println(reflect.TypeOf(pairsPositive))

[]string


9 <nil>

In [13]:
fmt.Println(pairsPositive[:4])

[them_it:1 hovering:1 and_occasional:1 cousin_the:2]


53 <nil>

In [14]:
pairsPositive[0]

them_it:1

In [15]:
pairsPositive[1]

hovering:1

In [17]:
import "strconv"

// pairsAndFilters parses "phrase:count" tokens into a slice of Pair and also
// returns a set of the phrases seen, which can be used as a lookup table later.
//
// Each token is split at its LAST colon, so a phrase that itself contains a
// colon (e.g. "12:30:4") keeps its full text and its real count instead of
// being truncated at the first colon.
//
// The phrase of every token is recorded in the returned map, including tokens
// that produce no Pair because they have no colon or a non-integer count.
// Such tokens are skipped silently. The returned slice is nil when no token
// parses; the returned map is never nil.
func pairsAndFilters(splitPairs []string) ([]Pair, map[string]bool) {
  var pairs []Pair
  m := make(map[string]bool)
  for _, pair := range splitPairs {
    sep := strings.LastIndex(pair, ":")
    if sep < 0 {
      // No count part at all: remember the phrase, nothing to parse.
      m[pair] = true
      continue
    }
    phrase := pair[:sep]
    m[phrase] = true
    freq, err := strconv.Atoi(pair[sep+1:])
    if err != nil {
      // Count is not an integer: keep the phrase in the lookup, drop the pair.
      continue
    }
    pairs = append(pairs, Pair{
      Phrase: phrase,
      Frequency: freq,
    })
  }
  return pairs, m
}


In [18]:
// exclude filters pairs, keeping only the entries whose Phrase is not marked
// true in the exclusions lookup table. The order of the kept entries matches
// the input, and the result is nil when nothing is kept.
func exclude(pairs []Pair, exclusions map[string]bool) []Pair {
  var kept []Pair
  for _, candidate := range pairs {
    if exclusions[candidate.Phrase] {
      continue
    }
    kept = append(kept, candidate)
  }
  return kept
}


In [19]:
// Parse both token lists, keeping a phrase lookup table for each side.
parsedPositives, posPhrases := pairsAndFilters(pairsPositive)
parsedNegatives, negPhrases := pairsAndFilters(pairsNegative)
// Cross-filter: drop from each side every phrase that also occurs on the
// other side, so each list keeps only phrases unique to its own file.
parsedPositives = exclude(parsedPositives, negPhrases)
parsedNegatives = exclude(parsedNegatives, posPhrases)


In [22]:
reflect.TypeOf(parsedPositives)

[]struct { Phrase string; Frequency int }

In [23]:
reflect.TypeOf(parsedNegatives)

[]struct { Phrase string; Frequency int }

## Using [gota](https://godoc.org/github.com/kniren/gota/dataframe)

In [30]:
import "github.com/kniren/gota/dataframe"

// Build one DataFrame per sentiment from the []Pair slices; the outputs below
// show one column per struct field (Phrase, Frequency).
dfPos := dataframe.LoadStructs(parsedPositives)
dfNeg := dataframe.LoadStructs(parsedNegatives)

In [31]:
// Reorder both frames by the Frequency column in reverse (highest first, as
// the printed frames below show).
dfPos = dfPos.Arrange(dataframe.RevSort("Frequency"))
dfNeg = dfNeg.Arrange(dataframe.RevSort("Frequency"))

In [32]:
dfPos

[46383x2] DataFrame

    Phrase       Frequency
 0: tic-tac-toe  10       
 1: wusthoff     7        
 2: emperor      7        
 3: shot_glasses 6        
 4: pulp         6        
 5: games        6        
 6: sentry       6        
 7: gravel       6        
 8: the_emperor  5        
 9: aebleskivers 5        
    ...          ...      
    <string>     <int>    


In [33]:
dfNeg

[45760x2] DataFrame

    Phrase          Frequency
 0: seeds           9        
 1: perculator      7        
 2: probes          7        
 3: cork            7        
 4: coffee_tank     5        
 5: brookstone      5        
 6: convection_oven 5        
 7: black_goo       5        
 8: waring_pro      5        
 9: packs           5        
    ...             ...      
    <string>        <int>    


## General gota overview

In [3]:
import "github.com/kniren/gota/dataframe"
import "github.com/kniren/gota/series"

In [4]:
dataframe

{dataframe "github.com/kniren/gota/dataframe", 17 binds, 7 types}

In [5]:
series

{series "github.com/kniren/gota/series", 16 binds, 7 types}

In [7]:
import "fmt"
import "reflect"

fmt.Println(reflect.TypeOf(dataframe))

*fast.Import


13 <nil>

In [8]:
fmt.Println(reflect.TypeOf(series))

*fast.Import


13 <nil>

In [10]:
// Build a DataFrame from string records. Judging by the output below, the
// first record supplies the column names and each column's type
// (string/int/float/bool) is detected from the remaining values.
df := dataframe.LoadRecords(
    [][]string{
        []string{"A", "B", "C", "D"},
        []string{"a", "4", "5.1", "true"},
        []string{"k", "5", "7.0", "true"},
        []string{"k", "4", "6.0", "true"},
        []string{"a", "2", "7.1", "false"},
    },
)

In [11]:
df

[4x4] DataFrame

    A        B     C        D     
 0: a        4     5.100000 true  
 1: k        5     7.000000 true  
 2: k        4     6.000000 true  
 3: a        2     7.100000 false 
    <string> <int> <float>  <bool>


In [12]:
// User is a sample record type for demonstrating dataframe.LoadStructs.
type User struct {
    Name     string
    Age      int
    Accuracy float64
    ignored  bool  // unexported; NOTE(review): the output below still lists it, as column "𒀸ignored" — presumably an interpreter artifact, confirm
}

// Sample rows; the literals are positional, so values follow field order.
users := []User{
    {"Aram", 17, 0.2, true},
    {"Juan", 18, 0.8, true},
    {"Ana", 22, 0.5, true},
}

// Re-declares df (first declared in the LoadRecords cell) with a frame built from the structs.
df := dataframe.LoadStructs(users)

In [13]:
df

[3x4] DataFrame

    Name     Age   Accuracy 𒀸ignored
 0: Aram     17    0.200000 true    
 1: Juan     18    0.800000 true    
 2: Ana      22    0.500000 true    
    <string> <int> <float>  <bool>  


***